diff --git a/.bazelrc b/.bazelrc index 73926e5a2f9..066b0db10bc 100644 --- a/.bazelrc +++ b/.bazelrc @@ -5,6 +5,7 @@ # Android options: # android: # android_arm: +# android_arm64: # android_x86: # android_x86_64: # @@ -18,8 +19,10 @@ # # Compiler options: # cuda_clang: Use clang when building CUDA code. -# c++17: Build with C++17 options -# c++1z: Build with C++17 options +# c++17: Build with C++17 options (links with libc++) +# c++1z: Build with C++17 options (links with libc++) +# c++17_gcc: Build with C++17 options (links with stdlibc++) +# c++1z_gcc: Build with C++17 options (links with stdlibc++) # avx_linux: Build with avx instruction set on linux. # avx2_linux: Build with avx2 instruction set on linux. # native_arch_linux: Build with instruction sets available to the host machine on linux @@ -44,10 +47,6 @@ # using_cuda: CUDA is available to build system. # cuda: Build with full cuda support. # rocm: Build with AMD GPU support (rocm). -# sycl: Build with SYCL support. -# sycl_nodouble: -# sycl_asan: -# sycl_trisycl: # mkl: Enable full mkl support. # tensorrt: Enable Tensorrt support. # ngraph: Enable ngraph support. @@ -87,6 +86,7 @@ # release_cpu_linux: Toolchain and CUDA options for Linux CPU builds. # release_cpu_macos: Toolchain and CUDA options for MacOS CPU builds. # release_gpu_linux: Toolchain and CUDA options for Linux GPU builds. +# release_gpu_linux_cuda_10_1: Toolchain and CUDA options for CUDA 10.1 Linux GPU builds. # release_cpu_windows: Toolchain and CUDA options for Windows CPU builds. # release_gpu_windows: Toolchain and CUDA options for Windows GPU builds. @@ -159,13 +159,11 @@ build --host_java_toolchain=//third_party/toolchains/java:tf_java_toolchain # environment variable "TF_MKL_ROOT" every time before build. build:mkl --define=build_with_mkl=true --define=enable_mkl=true build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt # config to build OneDNN backend with a user specified threadpool. 
build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl_threadpool --define=build_with_mkl_dnn_v1_only=true build:mkl_threadpool --define=build_with_mkl_opensource=true build:mkl_threadpool --define=build_with_mkldnn_threadpool=true build:mkl_threadpool -c opt @@ -173,7 +171,6 @@ build:mkl_threadpool -c opt # Config setting to build with oneDNN and without the binary blob build:mkl_opensource_only --define=build_with_mkl=true --define=enable_mkl=true build:mkl_opensource_only --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl_opensource_only --define=build_with_mkl_dnn_v1_only=true build:mkl_opensource_only --define=build_with_mkl_opensource=true build:mkl_opensource_only -c opt @@ -214,19 +211,6 @@ build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true build:rocm --action_env TF_NEED_ROCM=1 -build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain -build:sycl --define=using_sycl=true -build:sycl --action_env TF_NEED_OPENCL_SYCL=1 - -build:sycl_nodouble --config=sycl -build:sycl_nodouble --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE - -build:sycl_nodouble --config=sycl -build:sycl_asan --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address - -build:sycl_nodouble --config=sycl -build:sycl_trisycl --define=using_trisycl=true - # Options extracted from configure script build:ngraph --define=with_ngraph_support=true build:numa --define=with_numa_support=true @@ -278,6 +262,8 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS build:c++17 --cxxopt=-std=c++1z build:c++17 --cxxopt=-stdlib=libc++ build:c++1z --config=c++17 +build:c++17_gcc --cxxopt=-std=c++1z +build:c++1z_gcc --config=c++17_gcc # Enable using platform specific build settings, except when cross-compiling for # mobile platforms. @@ -289,6 +275,7 @@ build:ios --noenable_platform_specific_config build:android --copt=-w build:ios --copt=-w build:linux --copt=-w +build:linux --host_copt=-w build:macos --copt=-w build:windows --copt=/w @@ -330,6 +317,11 @@ build:windows --host_copt=-DWIN32_LEAN_AND_MEAN build:windows --copt=-DNOGDI build:windows --host_copt=-DNOGDI +# MSVC (Windows): Standards-conformant preprocessor mode +# See https://docs.microsoft.com/en-us/cpp/preprocessor/preprocessor-experimental-overview +build:windows --copt=/experimental:preprocessor +build:windows --host_copt=/experimental:preprocessor + # Misc build options we need for windows. build:windows --linkopt=/DEBUG build:windows --host_linkopt=/DEBUG @@ -354,6 +346,7 @@ build --config=short_logs # TODO(gunan): Create a feature in toolchains for avx/avx2 to # avoid having to define linux/win separately. build:avx_linux --copt=-mavx +build:avx_linux --host_copt=-mavx build:avx2_linux --copt=-mavx2 build:native_arch_linux --copt=-march=native build:avx_win --copt=/arch=AVX @@ -368,7 +361,6 @@ build --config=v2 test --config=v2 # Enable XLA -build:xla --action_env=TF_ENABLE_XLA=1 build:xla --define=with_xla_support=true # BEGIN TF REMOTE BUILD EXECUTION OPTIONS @@ -408,9 +400,12 @@ build:rbe_linux --config=avx_linux build:rbe_linux --config=short_logs # TODO(gunan): Check why we need this specified in rbe, but not in other builds. 
build:rbe_linux --linkopt=-lrt +build:rbe_linux --host_linkopt=-lrt build:rbe_linux --linkopt=-lm +build:rbe_linux --host_linkopt=-lm build:rbe_cpu_linux --config=rbe_linux +build:rbe_cpu_linux --host_crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain" build:rbe_cpu_linux --crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain" build:rbe_cpu_linux --extra_toolchains="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:cc-toolchain-k8" build:rbe_cpu_linux --extra_execution_platforms="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" @@ -428,6 +423,7 @@ test:rbe_linux_cuda_base --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/ build:rbe_linux_cuda10.1_nvcc_base --config=rbe_linux_cuda_base build:rbe_linux_cuda10.1_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda10.1_nvcc_base --host_crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" build:rbe_linux_cuda10.1_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" build:rbe_linux_cuda10.1_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" build:rbe_linux_cuda10.1_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" @@ -444,6 +440,7 @@ build:rbe_linux_cuda10.1_nvcc_py3.8 --config=rbe_linux_cuda10.1_nvcc_base --repo build:rbe_linux_cuda11.0_nvcc_base --config=rbe_linux_cuda_base build:rbe_linux_cuda11.0_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda11.0_nvcc_base --host_crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain" build:rbe_linux_cuda11.0_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain" build:rbe_linux_cuda11.0_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain-linux-x86_64" build:rbe_linux_cuda11.0_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_platform//:platform" @@ -458,12 +455,12 @@ build:rbe_linux_cuda11.0_nvcc_py3.6 --config=rbe_linux_cuda11.0_nvcc_base --repo build:rbe_linux_cuda11.0_nvcc_py3.7 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.7" build:rbe_linux_cuda11.0_nvcc_py3.8 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.8" -# Map default to CUDA 10.1. +# Map default to CUDA 11 for PY35 and greater. 
build:rbe_linux_cuda_nvcc_py27 --config=rbe_linux_cuda10.1_nvcc_py2.7 -build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda10.1_nvcc_py3.5 -build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda10.1_nvcc_py3.6 -build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda10.1_nvcc_py3.7 -build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda10.1_nvcc_py3.8 +build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda11.0_nvcc_py3.5 +build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda11.0_nvcc_py3.6 +build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda11.0_nvcc_py3.7 +build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda11.0_nvcc_py3.8 # Deprecated configs that people might still use. build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_nvcc_py36 @@ -580,11 +577,11 @@ build:release_cpu_macos --config=avx_linux build:release_gpu_common --config=release_common build:release_gpu_common --config=cuda build:release_gpu_common --config=tensorrt -build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.1" -build:release_gpu_common --action_env=TF_CUDA_VERSION="10" -build:release_gpu_common --action_env=TF_CUDNN_VERSION="7" +build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-11.0" +build:release_gpu_common --action_env=TF_CUDA_VERSION="11" +build:release_gpu_common --action_env=TF_CUDNN_VERSION="8" build:release_gpu_common --action_env=TF_NEED_TENSORRT="1" -build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_37,sm_52,sm_60,sm_61,compute_70" +build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" build:release_gpu_common --action_env=TENSORRT_INSTALL_PATH="/usr/local/tensorrt" build:release_gpu_common --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib" build:release_gpu_common --action_env=GCC_HOST_COMPILER_PATH="/usr/bin/gcc-5" @@ -592,8 +589,7 @@ build:release_gpu_common --action_env=GCC_HOST_COMPILER_PATH="/usr/bin/gcc-5" build:release_gpu_linux --config=release_gpu_common build:release_gpu_linux --config=avx_linux -build:release_gpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain - +build:release_gpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda11:toolchain build:release_windows_common --config=release_common build:release_windows_common --define=no_tensorflow_py_deps=true build:release_windows_common --announce_rc @@ -601,3 +597,8 @@ build:release_windows_common --announce_rc build:release_cpu_windows --config=release_windows_common build:release_gpu_windows --config=release_windows_common + +build:release_gpu_linux_cuda_10_1 --config=release_gpu_linux +build:release_gpu_linux_cuda_10_1 --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.1" +build:release_gpu_linux_cuda_10_1 --action_env=TF_CUDA_VERSION="10" +build:release_gpu_linux_cuda_10_1 --action_env=TF_CUDNN_VERSION="7" diff --git a/.github/bot_config.yml b/.github/bot_config.yml index d0e7256aec0..952ff91fef7 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -40,6 +40,22 @@ segfault_memory: # assignees filesystem_security_assignee: - mihaimaruseac + +tflite_micro_path: + - tensorflow/lite/micro + +tflite_micro_comment: > + Thanks for contributing to TensorFlow Lite Micro. 
+ + + To keep this process moving along, we'd like to make sure that you have completed the items on this list: + * Read the [contributing guidelines for TensorFlow Lite Micro](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/CONTRIBUTING.md) + * Created a [TF Lite Micro Github issue](https://github.com/tensorflow/tensorflow/issues/new?labels=comp%3Amicro&template=70-tflite-micro-issue.md) + * Linked to the issue from the PR description + + + We would like to have a discussion on the Github issue first to determine the best path forward, and then proceed to the PR review. + # Cuda Comment cuda_comment: > From the template it looks like you are installing **TensorFlow** (TF) prebuilt binaries: diff --git a/ADOPTERS.md b/ADOPTERS.md deleted file mode 100644 index c0be567dc14..00000000000 --- a/ADOPTERS.md +++ /dev/null @@ -1,10 +0,0 @@ -# TensorFlow Adopters - -This page contains a list of people and organizations who are using TensorFlow. If you'd like to be included -here, please send a pull request which modifies this file. - -We intend to use this list to contact you for surveys, and to find good candidates for invite-only events. -We will also point to this list if we are asked who uses TensorFlow. - -We will not use any of the information here for promotions or to send other regular communications. You -should subscribe to discuss@tensorflow.org for such announcements. diff --git a/CODEOWNERS b/CODEOWNERS index 3ef02ffd68c..83ad24b2845 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,16 +1,15 @@ # Where component owners are known, add them here. -/tensorflow/c/eager @jaingaurav @alextp -/tensorflow/core/common_runtime/eager @jaingaurav @alextp +/tensorflow/c/eager @qqfish @kkimdev +/tensorflow/core/common_runtime/eager @qqfish @kkimdev /tenosrflow/core/debug @caisq /tensorflow/core/nccl/ @azaks2 @chsigg -/tensorflow/core/platform/windows/ @mrry +/tensorflow/core/platform/windows/ @gunan @mihaimaruseac /tensorflow/lite/experimental/micro @petewarden @advaitjain /tensorflow/python/autograph/ @mdanatg @kkimdev /tensorflow/python/debug @caisq -/tensorflow/python/eager @jaingaurav @alextp +/tensorflow/python/eager @rohan100jain @kkimdev /tensorflow/python/tools/api/generator/ @annarev -/tensorflow/tensorboard/ @jart /tensorflow/tools/docs/ @markdaoust /third_party/systemlibs/ @perfinion diff --git a/README.md b/README.md index 6398e8e27a1..f888f6bd9d4 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Build Type * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml) * [TensorFlow Twitter](https://twitter.com/tensorflow) * [TensorFlow YouTube](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) +* [TensorFlow Roadmap](https://www.tensorflow.org/model_optimization/guide/roadmap) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorBoard Visualization Toolkit](https://github.com/tensorflow/tensorboard) diff --git a/RELEASE.md b/RELEASE.md index b0c785c7d68..89dd3a8a78c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -22,6 +22,7 @@ * Code that uses `tf.map_fn`/`tf.cond`/`tf.while_loop`/control flow as op layers and happens to work before TF 2.4. These will explicitly be unsupported now. Converting these ops to Functional API op layers was unreliable before TF 2.4, and prone to erroring incomprehensibly or being silently buggy. 
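The breaking change above about `tf.map_fn`/`tf.cond`/`tf.while_loop` used as op layers can be illustrated by the pattern that remains supported: keep raw TF control flow inside a `tf.keras.layers.Layer` subclass (or a `Lambda` layer) rather than applying it directly to symbolic Keras inputs. A minimal sketch, not part of this patch; the layer name, threshold, and shapes are illustrative only:

```python
import tensorflow as tf

class ClipOrScale(tf.keras.layers.Layer):
    """Wraps tf.cond so control flow runs inside Layer.call, not on Keras symbolic inputs."""

    def call(self, x):
        # The cond executes on concrete tensors when the model runs,
        # instead of being converted into a Functional API op layer.
        return tf.cond(tf.reduce_mean(x) > 0.0,
                       lambda: tf.clip_by_value(x, -1.0, 1.0),
                       lambda: x * 0.5)

inputs = tf.keras.Input(shape=(8,))
outputs = ClipOrScale()(inputs)   # supported: control flow is hidden inside the layer
model = tf.keras.Model(inputs, outputs)
```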
* Code that directly asserts on a Keras symbolic value in cases where ops like `tf.rank` used to return a static or symbolic value depending on if the input had a fully static shape or not. Now these ops always return symbolic values. * Code already susceptible to leaking tensors outside of graphs becomes slightly more likely to do so now. + * Code that tries directly getting gradients with respect to symbolic Keras inputs/outputs. Use GradientTape on the actual Tensors passed to the already-constructed model instead. * Code that requires very tricky shape manipulation via converted op layers in order to work, where the Keras symbolic shape inference proves insufficient. * Code that tries manually walking a `tf.keras.Model` layer by layer and assumes layers only ever have one positional argument. This assumption doesn't hold true before TF 2.4 either, but is more likely to cause issues know. * Code that manually enters `keras.backend.get_graph()` before building a functional model. This is no longer needed. @@ -33,6 +34,18 @@ shape assumptions (note that you can pass shapes with `None` entries for axes that are meant to be dynamic). You can also disable the input checking entirely by setting `model.input_spec = None`. +* XLA:CPU and XLA:GPU devices are no longer registered by default. Use + `TF_XLA_FLAGS=--tf_xla_enable_xla_devices` if you really need them (to be + removed). +* `tf.raw_ops.Max` and `tf.raw_ops.Min` no longer accept inputs of type + `tf.complex64` or `tf.complex128`, because the behavior of these ops is not + well defined for complex types. +* `tf.data.experimental.service.DispatchServer` now takes a config tuple + instead of individual arguments. Usages should be updated to + `tf.data.experimental.service.DispatchServer(dispatcher_config)`. +* `tf.data.experimental.service.WorkerServer` now takes a config tuple + instead of individual arguments. Usages should be updated to + `tf.data.experimental.service.WorkerServer(worker_config)`. ## Known Caveats @@ -67,11 +80,24 @@ the same sparsity pattern, but with new provided values. It is similar to the `with_values` function of `RaggedTensor`. * Added `StatelessCase` op, and uses it if none of case branches has stateful ops. + * Added `tf.config.experimental.get_memory_usage` to return total memory usage + of the device. * `tf.data`: * Added new `tf.data.experimental.service.register_dataset` and `tf.data.experimental.service.from_dataset_id` APIs to enable one process to register a dataset with the tf.data service, and another process to consume data from the dataset. + * Added support for tf.data service dispatcher fault tolerance. To enable + fault tolerance, configure a `work_dir` when running your dispatcher + server and set `dispatcher_fault_tolerance=True`. The dispatcher will + store its state to `work_dir`, so that on restart it can continue from its + previous state after restart. + * Added tf.data service support for sharing dataset graphs via shared + filesystem instead of over RPC. This reduces load on the dispatcher, + improving performance of distributing datasets. For this to work, the + dispatcher's `work_dir` must be accessible from workers. If the worker + fails to read from the `work_dir`, it falls back to using RPC for dataset + graph transfer. * Added optional `exclude_cols` parameter to CsvDataset. This parameter is the complement of `select_cols`; at most one of these should be specified. 
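A minimal sketch of the config-based `DispatchServer`/`WorkerServer` construction and the dispatcher fault tolerance described in the `tf.data` notes above. This is not part of the patch: the port, `work_dir` path, and `processing_mode` are placeholders, and the exact config field spelling (the note says `dispatcher_fault_tolerance`; the released API may spell it `fault_tolerant_mode`) should be checked against the final API:

```python
import tensorflow as tf

svc = tf.data.experimental.service

# The dispatcher and worker servers now take a single config object
# instead of individual constructor arguments.
dispatcher = svc.DispatchServer(
    svc.DispatcherConfig(port=5050,
                         work_dir="/tmp/tfdata_dispatcher",  # dispatcher state persisted here
                         fault_tolerant_mode=True))          # resume from work_dir on restart
worker = svc.WorkerServer(
    svc.WorkerConfig(dispatcher_address="localhost:5050"))

# One process registers a dataset with the service; another consumes it by id.
dataset = tf.data.Dataset.range(10)
dataset_id = svc.register_dataset(dispatcher.target, dataset)
consumed = svc.from_dataset_id(processing_mode="parallel_epochs",
                               service=dispatcher.target,
                               dataset_id=dataset_id,
                               element_spec=dataset.element_spec)
```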
* We have implemented an optimization which reorders data-discarding @@ -79,11 +105,19 @@ dataset when it is safe to do so. The optimization can be disabled via the `experimental_optimization.reorder_data_discarding_ops` dataset option. + * `tf.data.Options` were previously immutable and can now be overridden. + * `tf.data.Dataset.from_generator` now supports Ragged and Sparse tensors + with a new `output_signature` argument, which allows `from_generator` to + produce any type describable by a `tf.TypeSpec`. + * `tf.data.experimental.AUTOTUNE` is now available in the core API as + `tf.data.AUTOTUNE`. * `tf.image`: * Added deterministic `tf.image.stateless_random_*` functions for each - `tf.image.random_*` function. Given the same seed, the stateless functions - produce the same results independent of how many times the function is - called, and independent of global seed settings. + `tf.image.random_*` function. Added a new op + `stateless_sample_distorted_bounding_box` which is a deterministic + version of the `sample_distorted_bounding_box` op. Given the same seed, these + stateless functions/ops produce the same results independent of how many + times the function is called, and independent of global seed settings. * `tf.distribute`: * * `tf.keras`: @@ -95,16 +129,49 @@ * Error messages when Functional API construction goes wrong (and when ops cannot be converted to Keras layers automatically) should be clearer and easier to understand. * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape` as an alternative to accepting a `callable` loss. + * Added `beta` hyperparameter to FTRL optimizer classes (Keras and others) + to match FTRL paper (https://research.google.com/pubs/archive/41159.pdf). + * Added `mobilenet_v3` to Keras applications. + * `Optimizer.__init__` now accepts a `gradient_aggregator` to allow for + customization of how gradients are aggregated across devices, as well as + `gradient_transformers` to allow for custom gradient transformations + (such as gradient clipping). + * The `steps_per_execution` argument in `compile()` is no longer + experimental; if you were passing `experimental_steps_per_execution`, + rename it to `steps_per_execution` in your code. This argument controls + the number of batches to run during each `tf.function` call when calling + `fit()`. Running multiple batches inside a single `tf.function` call can + greatly improve performance on TPUs or small models with a large Python + overhead. * `tf.function` / AutoGraph: * Added `experimental_follow_type_hints` argument for `tf.function`. When True, the function may use type annotations to optimize the tracing performance. * Added support for `iter(DistributedDataset)` in AutoGraph `for` loops. + * AutoGraph now allows creating new symbols inside a TensorFlow loop, if + the values of these symbols at an iteration do not depend on the previous + iteration. These types of loops must run at least one iteration, and will + raise a runtime error otherwise. + + Example: + + ``` + for batch in data: + outputs = train_step(batch) + tf.print('final outputs', outputs) + ``` + See tensorflow/python/autograph/g3doc/reference/limitations.md for more + info. * `tf.lite`: * `DynamicBuffer::AddJoinedString()` will now add a separator if the first string to be joined is empty. * `TFLiteConverter`: * Support optional flags `inference_input_type` and `inference_output_type` for full integer quantized models.
This allows users to modify the model input and output type to integer types (`tf.int8`, `tf.uint8`) instead of defaulting to float type (`tf.float32`). + * Deprecate `Interpreter::UseNNAPI(bool)` C++ API + * Prefer using `NnApiDelegate()` and related delegate configuration methods directly. + * Add NNAPI Delegation support for requantization use cases by converting the operation into a dequantize-quantize pair. + * TFLite Profiler for Android is available. See the detailed + [guide](https://www.tensorflow.org/lite/performance/measurement#trace_tensorflow_lite_internals_in_android). * * `tf.random`: * @@ -116,14 +183,28 @@ behavior by adjusting the `l2` parameter. * * XLA Support: + * xla.experimental.compile is deprecated, use + `tf.function(experimental_compile=True)` instead + * Added `tf.function.experimental_get_compiler_ir` which returns compiler IR + (currently 'hlo' and 'optimized_hlo') for given input for given function. * * Tracing and Debugging: * +* `tf.train.Checkpoint`: + * Now accepts a `root` argument in the initialization, which generates a + checkpoint with a root object. This allows users to create a `Checkpoint` + object that is compatible with Keras `model.save_weights()` and + `model.load_weights`. The checkpoint is also compatible with the + checkpoint saved in the `variables/` folder in the SavedModel. + * When restoring, `save_path` can be a path to a SavedModel. The function + will automatically find the checkpoint in the SavedModel. +* `tf.nn`: + * `tf.nn.max_pool2d` now supports explicit padding. * Other: * We have replaced uses of "whitelist" and "blacklist" with "allowlist" and "denylist" where possible. Please see https://developers.google.com/style/word-list#blacklist for more context. - * + ## Thanks to our Contributors @@ -215,6 +296,7 @@ stjohnso98, , , , , * Mutable tables now restore checkpointed values when loaded from SavedModel. * GPU * TF 2.3 includes PTX kernels only for [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0 to reduce the TF pip binary size. Earlier releases included PTX for a variety of older compute capabilities. + * Remove environmental variable `TF_USE_CUDNN`. * Others * Retain parent namescope for ops added inside `tf.while_loop`/`tf.cond`/`tf.switch_case`. * Update `tf.vectorized_map` to support vectorizing `tf.while_loop` and TensorList operations. @@ -1546,6 +1628,7 @@ Yuan (Terry) Tang, Yuchen Ying, Yves-Noel Weweler, zhangyujing, zjjott, zyeric, color palette of the frame. This has been fixed now * image.resize now considers proper pixel centers and has new kernels (incl. anti-aliasing). + * Added an isotonic regression solver (tf.nn.isotonic_regression). * Performance * Turn on MKL-DNN contraction kernels by default. 
MKL-DNN dynamically dispatches the best kernel implementation based on CPU vector diff --git a/configure.cmd b/configure.cmd index 021afdbbea1..738e106da18 100644 --- a/configure.cmd +++ b/configure.cmd @@ -16,5 +16,5 @@ set configure_dir=%~dp0 set configure_dir=%configure_dir:~0,-1% -python %configure_dir%\configure.py %* || ( exit /b ) +python "%configure_dir%\configure.py" %* || ( exit /b ) echo Configuration finished diff --git a/configure.py b/configure.py index 9524eada3cd..5b9fd55b740 100644 --- a/configure.py +++ b/configure.py @@ -38,9 +38,6 @@ _DEFAULT_CUDNN_VERSION = '7' _DEFAULT_TENSORRT_VERSION = '6' _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0' -_TF_OPENCL_VERSION = '1.2' -_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' -_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include' _SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -1114,62 +1111,6 @@ def set_host_c_compiler(environ_cp): write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler) -def set_computecpp_toolkit_path(environ_cp): - """Set COMPUTECPP_TOOLKIT_PATH.""" - - def toolkit_exists(toolkit_path): - """Check if a computecpp toolkit path is valid.""" - if is_linux(): - sycl_rt_lib_path = 'lib/libComputeCpp.so' - else: - sycl_rt_lib_path = '' - - sycl_rt_lib_path_full = os.path.join(toolkit_path, sycl_rt_lib_path) - exists = os.path.exists(sycl_rt_lib_path_full) - if not exists: - print('Invalid SYCL %s library path. %s cannot be found' % - (_TF_OPENCL_VERSION, sycl_rt_lib_path_full)) - return exists - - computecpp_toolkit_path = prompt_loop_or_load_from_env( - environ_cp, - var_name='COMPUTECPP_TOOLKIT_PATH', - var_default=_DEFAULT_COMPUTECPP_TOOLKIT_PATH, - ask_for_var=( - 'Please specify the location where ComputeCpp for SYCL %s is ' - 'installed.' % _TF_OPENCL_VERSION), - check_success=toolkit_exists, - error_msg='Invalid SYCL compiler path. %s cannot be found.', - suppress_default_error=True) - - write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH', - computecpp_toolkit_path) - - -def set_trisycl_include_dir(environ_cp): - """Set TRISYCL_INCLUDE_DIR.""" - - ask_trisycl_include_dir = ('Please specify the location of the triSYCL ' - 'include directory. 
(Use --config=sycl_trisycl ' - 'when building with Bazel) ' - '[Default is %s]: ') % ( - _DEFAULT_TRISYCL_INCLUDE_DIR) - - while True: - trisycl_include_dir = get_from_env_or_user_or_default( - environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir, - _DEFAULT_TRISYCL_INCLUDE_DIR) - if os.path.exists(trisycl_include_dir): - break - - print('Invalid triSYCL include directory, %s cannot be found' % - (trisycl_include_dir)) - - # Set TRISYCL_INCLUDE_DIR - environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir - write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir) - - def system_specific_test_config(environ_cp): """Add default build and test flags required for TF tests to bazelrc.""" write_to_bazelrc('test --flaky_test_attempts=3') @@ -1397,8 +1338,6 @@ def main(): setup_python(environ_cp) if is_windows(): - environ_cp['TF_NEED_OPENCL_SYCL'] = '0' - environ_cp['TF_NEED_COMPUTECPP'] = '0' environ_cp['TF_NEED_OPENCL'] = '0' environ_cp['TF_CUDA_CLANG'] = '0' environ_cp['TF_NEED_TENSORRT'] = '0' @@ -1415,21 +1354,6 @@ def main(): if environ_cp.get('TF_ENABLE_XLA', '1') == '1': write_to_bazelrc('build --config=xla') - set_action_env_var( - environ_cp, - 'TF_NEED_OPENCL_SYCL', - 'OpenCL SYCL', - False, - bazel_config_name='sycl') - if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': - set_host_cxx_compiler(environ_cp) - set_host_c_compiler(environ_cp) - set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True) - if environ_cp.get('TF_NEED_COMPUTECPP') == '1': - set_computecpp_toolkit_path(environ_cp) - else: - set_trisycl_include_dir(environ_cp) - set_action_env_var( environ_cp, 'TF_NEED_ROCM', 'ROCm', False, bazel_config_name='rocm') if (environ_cp.get('TF_NEED_ROCM') == '1' and @@ -1442,6 +1366,11 @@ def main(): write_action_env_to_bazelrc('ROCM_PATH', environ_cp.get('ROCM_PATH')) write_action_env_to_bazelrc('ROCM_ROOT', environ_cp.get('ROCM_PATH')) + if ((environ_cp.get('TF_NEED_ROCM') == '1') and + (environ_cp.get('TF_ENABLE_MLIR_GENERATED_GPU_KERNELS') == '1')): + write_to_bazelrc( + 'build:rocm --define tensorflow_enable_mlir_generated_gpu_kernels=1') + environ_cp['TF_NEED_CUDA'] = str( int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and @@ -1523,17 +1452,15 @@ def main(): # use it for the CPU build. set_tf_download_clang(environ_cp) - # SYCL / ROCm / CUDA are mutually exclusive. + # ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. gpu_platform_count = 0 - if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': - gpu_platform_count += 1 if environ_cp.get('TF_NEED_ROCM') == '1': gpu_platform_count += 1 if environ_cp.get('TF_NEED_CUDA') == '1': gpu_platform_count += 1 if gpu_platform_count >= 2: - raise UserInputError('SYCL / CUDA / ROCm are mututally exclusive. ' + raise UserInputError('CUDA / ROCm are mututally exclusive. ' 'At most 1 GPU platform can be configured.') set_cc_opt_flags(environ_cp) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index d1c1d7dcdef..668f3a55579 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -562,6 +562,7 @@ selects.config_setting_group( package_group( name = "internal", packages = [ + "//learning/brain/distribute/...", "//learning/brain/swift/x10/...", "//perftools/accelerators/xprof/api/...", "//tensorflow/...", @@ -578,11 +579,6 @@ package_group( packages = ["//learning/pathways/..."], ) -# Packages that use composite tensors or dispatch. -# TODO(b/154762408) Remove this package group once it's no longer needed. 
-# If this is modified, then copy.bara.sky must also be modified. -package_group(name = "composite_tensor_whitelist") - # Packages that use private types symbols, until they are exported. # TODO(b/154650521) Remove. package_group( diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index 0cd2b7da139..5932dda514d 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -137,7 +137,7 @@ if _running_from_pip_package(): # TODO(gunan): Add sanity checks to loaded modules here. for _s in _site_packages_dirs: # Load first party dynamic kernels. - _main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels') + _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') if _fi.file_exists(_main_dir): _ll.load_library(_main_dir) diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index b73af197f7b..0d1d2e56fae 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -147,7 +147,7 @@ if _running_from_pip_package(): # TODO(gunan): Add sanity checks to loaded modules here. for _s in _site_packages_dirs: # Load first party dynamic kernels. - _main_dir = _os.path.join(_s, 'tensorflow_core/core/kernels') + _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') if _fi.file_exists(_main_dir): _ll.load_library(_main_dir) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index e5efe323922..01f48cad192 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -23,6 +23,7 @@ filegroup( srcs = [ "c_api.h", "c_api_experimental.h", + "c_api_macros.h", "tensor_interface.h", "tf_attrtype.h", "tf_datatype.h", @@ -57,10 +58,11 @@ filegroup( visibility = ["//visibility:public"], ) -filegroup( +cc_library( name = "pywrap_required_hdrs", - srcs = [ + textual_hdrs = [ "c_api_internal.h", + "c_api_macros.h", "conversion_macros.h", "python_api.h", "tensor_interface.h", @@ -79,6 +81,7 @@ tf_cuda_library( hdrs = [ "c_api.h", "c_api_internal.h", + "c_api_macros.h", "tf_datatype.h", "tf_tensor.h", "tf_tstring.h", @@ -217,6 +220,7 @@ cc_library( name = "logging", srcs = ["logging.cc"], hdrs = ["logging.h"], + visibility = ["//visibility:public"], deps = [ ":c_api_macros", "//tensorflow/core/platform:logging", @@ -310,6 +314,7 @@ cc_library( hdrs = ["tf_tensor.h"], visibility = ["//visibility:public"], deps = [ + ":c_api_macros", ":tensor_interface", ":tf_datatype", ":tf_status", @@ -336,6 +341,7 @@ tf_cuda_library( ], visibility = ["//tensorflow:internal"], deps = [ + ":c_api_macros", ":tensor_interface", ":tf_datatype", ":tf_status", @@ -371,6 +377,7 @@ tf_cuda_library( "//tensorflow/c/eager:tfe_op_internal", "//tensorflow/c/eager:tfe_tensorhandle_internal", "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:get_compiler_ir", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -381,6 +388,7 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/platform", + "//tensorflow/core/platform:blocking_counter", "@com_google_absl//absl/strings", ], alwayslink = 1, diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index b4297033b6d..81fb9d1a2b8 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/blocking_counter.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/net.h" @@ -560,6 +561,21 @@ TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, collective_executor_handle->get()->StartAbort(status->status); } +TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx, + const char* task, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + auto collective_executor_handle = context->GetCollectiveExecutorHandle(); + tensorflow::Notification done; + collective_executor_handle->get()->remote_access()->CheckPeerHealth( + task, [&done, status](const Status& s) { + status->status = s; + done.Notify(); + }); + done.WaitForNotification(); +} + TF_ShapeAndTypeList* TF_NewShapeAndTypeList(int num_items) { TF_ShapeAndTypeList* result = new TF_ShapeAndTypeList; result->num_items = num_items; diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index ebd14b4b571..c9c74f4e874 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -231,13 +231,20 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx, TF_Status* status); // Aborts all ongoing collectives with the specified status. After abortion, -// subsequent collectives will error with this status immediately. +// subsequent collectives will error with this status immediately. To reset the +// collectives, create a new EagerContext. // -// This is intended to be used when a peer failure is detected. There's yet no -// way to reset the collectives other than restarting the program. +// This is intended to be used when a peer failure is detected. TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, TF_Status* status); +// Checks the health of collective ops peers. Explicit health check is needed in +// multi worker collective ops to detect failures in the cluster. If a peer is +// down, collective ops may hang. +TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx, + const char* task, + TF_Status* status); + // Information about the shape of a Tensor and its type. struct TF_ShapeAndType { // Number of dimensions. -1 indicates unknown rank. diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index 3fff9bcd371..ec8cfe4a31a 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -1704,66 +1704,5 @@ TEST_F(CApiFunctionTest, GetFunctionsFromGraph) { TF_DeleteFunction(func1); } -// This test only works when the TF build includes XLA compiler. One way to set -// this up is via bazel build option "--define with_xla_support=true". -// -// FIXME: generalize the macro name TENSORFLOW_EAGER_USE_XLA to -// something like TENSORFLOW_CAPI_USE_XLA. 
-#ifdef TENSORFLOW_EAGER_USE_XLA -TEST_F(CApiFunctionTest, StatelessIf_XLA) { - TF_Function* func; - const std::string funcName = "BranchFunc"; - DefineFunction(funcName.c_str(), &func); - TF_GraphCopyFunction(host_graph_, func, nullptr, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_Operation* feed = Placeholder(host_graph_, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_Operation* true_cond = ScalarConst(true, host_graph_, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_OperationDescription* desc = - TF_NewOperation(host_graph_, "StatelessIf", "IfNode"); - TF_AddInput(desc, {true_cond, 0}); - TF_Output inputs[] = {{feed, 0}}; - TF_AddInputList(desc, inputs, TF_ARRAYSIZE(inputs)); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - TF_SetAttrType(desc, "Tcond", TF_BOOL); - TF_DataType inputType = TF_INT32; - TF_SetAttrTypeList(desc, "Tin", &inputType, 1); - TF_SetAttrTypeList(desc, "Tout", &inputType, 1); - TF_SetAttrFuncName(desc, "then_branch", funcName.data(), funcName.size()); - TF_SetAttrFuncName(desc, "else_branch", funcName.data(), funcName.size()); - TF_SetDevice(desc, "/device:XLA_CPU:0"); - auto op = TF_FinishOperation(desc, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - ASSERT_NE(op, nullptr); - - // Create a session for this graph. - CSession csession(host_graph_, s_, /*use_XLA*/ true); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - // Run the graph. - csession.SetInputs({{feed, Int32Tensor(17)}}); - csession.SetOutputs({op}); - csession.Run(s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - TF_Tensor* out = csession.output_tensor(0); - ASSERT_TRUE(out != nullptr); - EXPECT_EQ(TF_INT32, TF_TensorType(out)); - EXPECT_EQ(0, TF_NumDims(out)); // scalar - ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out)); - int32* output_contents = static_cast(TF_TensorData(out)); - EXPECT_EQ(-17, *output_contents); - - // Clean up - csession.CloseAndDelete(s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_DeleteFunction(func); -} -#endif // TENSORFLOW_EAGER_USE_XLA - } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_api_macros.h b/tensorflow/c/c_api_macros.h index 85c9507db87..e0c91a0d549 100644 --- a/tensorflow/c/c_api_macros.h +++ b/tensorflow/c/c_api_macros.h @@ -30,4 +30,17 @@ limitations under the License. #endif // _WIN32 #endif // SWIG +// TF_Bool is the C API typedef for unsigned char, while TF_BOOL is +// the datatype for boolean tensors. +#ifndef TF_Bool +#define TF_Bool unsigned char +#endif // TF_Bool + +// Macro used to calculate struct size for maintaining ABI stability across +// different struct implementations. 
+#ifndef TF_OFFSET_OF_END +#define TF_OFFSET_OF_END(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif // TF_OFFSET_OF_END + #endif // TENSORFLOW_C_C_API_MACROS_H_ diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 61701bc8b21..d259b32f339 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -6,7 +6,6 @@ load( "tf_copts", "tf_cuda_cc_test", "tf_cuda_library", - "tfe_xla_copts", ) load( "//tensorflow/core/platform:build_config.bzl", @@ -31,7 +30,7 @@ tf_cuda_library( "c_api_unified_experimental.h", ], hdrs = ["c_api.h"], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ @@ -72,13 +71,6 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/lib:traceme", ], - }) + select({ - "//tensorflow:with_xla_support": [ - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/jit", - "//tensorflow/compiler/jit:xla_device", - ], - "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", "//tensorflow/core/common_runtime/eager:eager_operation", @@ -109,11 +101,17 @@ filegroup( "c_api_experimental.h", "c_api_internal.h", "c_api_unified_experimental.h", + "c_api_unified_experimental_internal.h", "dlpack.h", + "gradients.h", + "gradients_internal.h", "immediate_execution_context.h", "immediate_execution_operation.h", "immediate_execution_tensor_handle.h", + "mnist_gradients_testutil.h", + "tape.h", "tfe_cancellation_manager_internal.h", + "tfe_context_internal.h", "tfe_executor_internal.h", "tfe_monitoring_internal.h", "tfe_op_attrs_internal.h", @@ -171,31 +169,6 @@ cc_library( ], ) -cc_library( - name = "gradients", - srcs = [ - "gradients.cc", - "gradients_internal.h", - ], - hdrs = [ - "gradients.h", - ], - visibility = [ - "//tensorflow:internal", - ], - deps = [ - ":abstract_context", - ":abstract_operation", - ":abstract_tensor_handle", - ":c_api_unified_internal", - ":tape", - "//tensorflow/core/common_runtime/eager:attr_builder", - "//tensorflow/core/lib/llvm_rtti", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - ], -) - cc_library( name = "gradients_internal", srcs = [ @@ -228,7 +201,6 @@ tf_cuda_cc_test( "gradients_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ @@ -240,6 +212,7 @@ tf_cuda_cc_test( "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:array_grad", "//tensorflow/c/experimental/gradients:math_grad", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/cc/profiler", @@ -249,6 +222,184 @@ tf_cuda_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "gradients_util", + srcs = [ + "gradients_util.cc", + ], + hdrs = [ + "gradients_util.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_context", + ":abstract_operation", + ":abstract_tensor_handle", + ":c_api", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":tape", + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", 
+ "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "mnist_gradients_testutil", + srcs = [ + "mnist_gradients_testutil.cc", + ], + hdrs = [ + "mnist_gradients_testutil.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":gradients_util", + ":tape", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "gradient_checker", + srcs = [ + "gradient_checker.cc", + ], + hdrs = [ + "gradient_checker.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":gradients_util", + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cuda_cc_test( + name = "gradient_checker_test", + size = "small", + srcs = [ + "gradient_checker_test.cc", + ], + args = ["--heap_check=local"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags() + ["nomac"], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_test_util", + ":c_api_unified_internal", + ":gradient_checker", + ":gradients_internal", + ":gradients_util", + ":mnist_gradients_testutil", + "//tensorflow/c:c_api", + "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cuda_cc_test( + name = "mnist_gradients_test", + size = "small", + srcs = [ + "mnist_gradients_test.cc", + ], + args = ["--heap_check=local"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags() + [ + "nomac", + ], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":gradients_util", + 
":mnist_gradients_testutil", + "//tensorflow/c:c_api", + "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", ], @@ -482,7 +633,6 @@ tf_cuda_cc_test( "c_api_debug_test.cc", "c_api_test.cc", ], - extra_copts = tfe_xla_copts(), tags = [ "noguitar", # TODO(b/155445984): flaky #"guitar", @@ -508,6 +658,27 @@ tf_cuda_cc_test( ], ) +tf_cuda_library( + name = "c_api_remote_test_util", + testonly = 1, + srcs = ["c_api_remote_test_util.cc"], + hdrs = ["c_api_remote_test_util.h"], + visibility = ["//tensorflow:__subpackages__"], + deps = [ + ":c_api", + ":c_api_internal", + ":c_api_test_util", + ":tfe_tensorhandle_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "@com_google_absl//absl/strings", + ], +) + tf_cuda_cc_test( name = "c_api_remote_test", size = "small", @@ -516,7 +687,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", ], @@ -524,6 +694,7 @@ tf_cuda_cc_test( ":c_api", ":c_api_experimental", ":c_api_internal", + ":c_api_remote_test_util", ":c_api_test_util", ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", @@ -540,6 +711,24 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "c_api_remote_function_test", + size = "small", + srcs = [ + "c_api_remote_function_test.cc", + ], + # TODO(b/136478427): Figure out how to correctly shut the server down + args = ["--heap_check=local"], + tags = [ + "no_windows", + ], + deps = [ + ":c_api_remote_test_util", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cuda_cc_test( name = "c_api_distributed_test", size = "small", @@ -548,7 +737,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", "noasan", # leaks gRPC server instances @@ -582,7 +770,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", ], @@ -617,7 +804,7 @@ tf_cuda_library( "c_api_experimental.h", "c_api_unified_experimental.h", ], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ @@ -689,7 +876,6 @@ tf_cuda_cc_test( "c_api_experimental_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ @@ -702,6 +888,7 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + 
"//tensorflow/core/platform:status", "@com_google_absl//absl/strings", ], ) @@ -713,7 +900,6 @@ tf_cuda_cc_test( "c_api_unified_experimental_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ @@ -722,6 +908,7 @@ tf_cuda_cc_test( ":c_api_test_util", "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", "//tensorflow/cc/profiler", "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", @@ -831,7 +1018,11 @@ filegroup( "c_api_unified_experimental_eager.cc", "c_api_unified_experimental_graph.cc", "c_api_unified_experimental_internal.h", + "gradient_checker.cc", + "gradient_checker.h", "gradients.cc", # Uses RTTI. + "gradients_util.cc", + "gradients_util.h", "*test*", "*dlpack*", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 76d603694e3..fb5ce22ae5f 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -51,9 +51,6 @@ limitations under the License. #include "tensorflow/core/protobuf/device_filters.pb.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/util/device_name_utils.h" -#ifdef TENSORFLOW_EAGER_USE_XLA -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#endif // TENSORFLOW_EAGER_USE_XLA #include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -629,21 +626,30 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( "targets will fail."; } } else { - // The master's context_view_id will be incremented by one - // the UpdateRemoteMaster call later. We want all new workers and - // existing workers to also have the updated context_view_id, so - // we must set their context_view_id to the existing master's - // context_view_id + 1. - sg.Update(CreateRemoteContexts( - ctx, added_workers, context_id, context_view_id + 1, keep_alive_secs, - server_def, remote_eager_workers.get(), context->Executor().Async(), - context->LazyCopyFunctionRemoteInputs(), base_request)); + if (sg.ok()) { + // Create remote contexts on the newly added workers only if the master + // has collected all device information from them (i.e., the + // GetAllRemoteDevices call returns succussfully). Note that in rare cases + // GetAllRemoteDevices can still fail even with RPCs configured to wait + // until the remote workers to become alive. If the master creates remote + // contexts on the workers whose devices are still not collected, those + // workers will be treated as existing workers subsequently, so the master + // will never get devices from them even with retrying UpdateServerDef. + sg.Update(CreateRemoteContexts( + ctx, added_workers, context_id, context_view_id + 1, keep_alive_secs, + server_def, remote_eager_workers.get(), context->Executor().Async(), + context->LazyCopyFunctionRemoteInputs(), base_request)); + } if (!existing_workers.empty()) { if (VLOG_IS_ON(1)) { for (const string& w : existing_workers) { VLOG(1) << "Updating cluster with existing worker " << w; } } + // The master's context_view_id will be incremented by one in the + // UpdateRemoteMaster call later. We want existing workers to also have + // the updated context_view_id, so we must set their context_view_id to + // the master's current context_view_id + 1. 
sg.Update(UpdateRemoteContexts(ctx, existing_workers, added_workers, removed_workers, context_id, context_view_id + 1, server_def, @@ -724,7 +730,7 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { #ifdef PLATFORM_GOOGLE - return tensorflow::wrap(new tfrt::ContextInterface(opts->async)); + return tensorflow::wrap(new tfrt::tf::ContextInterface(opts->async)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); return nullptr; @@ -745,7 +751,6 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { opts->session_options.options, static_cast( opts->device_placement_policy), - static_cast(opts->mirroring_policy), opts->async, opts->lazy_remote_inputs_copy, device_mgr.release(), /*device_mgr_owned*/ true, r, tensorflow::GetDefaultCustomKernelCreator())); @@ -851,20 +856,9 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, #else // !defined(IS_MOBILE_PLATFORM) tensorflow::EagerContext* context = tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - tensorflow::GrpcServer* grpc_server = - static_cast(context->GetServer()); - - std::unique_ptr remote_eager_workers; - status->status = grpc_server->master_env()->worker_cache->GetEagerClientCache( - &remote_eager_workers); - if (!status->status.ok()) { - LOG(ERROR) << "Failed to get client cache for remote workers."; - return false; - } - // TODO(yuefengz): support partially specified `worker_name`. tensorflow::core::RefCountPtr eager_client; - status->status = remote_eager_workers->GetClient(worker_name, &eager_client); + status->status = context->GetClient(worker_name, &eager_client); if (!status->status.ok()) { return false; } @@ -1149,26 +1143,23 @@ void TFE_DeleteOp(TFE_Op* op) { tensorflow::unwrap(op)->Release(); } +const char* TFE_OpGetName(const TFE_Op* op, TF_Status* status) { + return tensorflow::unwrap(op)->Name().c_str(); +} + +TFE_Context* TFE_OpGetContext(const TFE_Op* op, TF_Status* status) { + return tensorflow::wrap( + &(OperationFromInterface(tensorflow::unwrap(op))->EagerContext())); +} + void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { status->status = tensorflow::unwrap(op)->SetDeviceName(device_name); } -const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { +const char* TFE_OpGetDevice(const TFE_Op* op, TF_Status* status) { return tensorflow::unwrap(op)->DeviceName().c_str(); } -void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { -#ifdef TENSORFLOW_EAGER_USE_XLA - tensorflow::Status s = tensorflow::unwrap(op)->SetUseXla(enable); - if (!s.ok()) { - LOG(ERROR) << "Could not enable XLA compilation for op: " << s; - } -#else - LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " - "built with XLA support."; -#endif // TENSORFLOW_EAGER_USE_XLA -} - void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { status->status = tensorflow::unwrap(op)->AddInput(tensorflow::unwrap(input)); } @@ -1181,6 +1172,15 @@ void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, static_cast(num_inputs)}); } +extern int TFE_OpGetFlatInputCount(const TFE_Op* op, TF_Status* status) { + return tensorflow::unwrap(op)->GetInputs().size(); +} + +extern TFE_TensorHandle* TFE_OpGetFlatInput(const TFE_Op* op, int index, + TF_Status* status) { + return tensorflow::wrap(tensorflow::unwrap(op)->GetInputs()[index]); +} + TF_AttrType 
TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret = TF_ATTR_INT; @@ -1486,7 +1486,7 @@ void TFE_ContextEndStep(TFE_Context* ctx) { tensorflow::unwrap(ctx)->EndStep(); } -const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op) { +const TFE_OpAttrs* TFE_OpGetAttrs(const TFE_Op* op) { return tensorflow::wrap( &OperationFromInterface(tensorflow::unwrap(op))->Attrs()); } @@ -1551,8 +1551,67 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, TFE_OpSetAttrFunction(op, attr_name, func_op); TFE_DeleteOp(func_op); } break; - case tensorflow::AttrValue::kList: - TF_FALLTHROUGH_INTENDED; + case tensorflow::AttrValue::kList: { + // String + if (const int s_size = default_value.list().s_size()) { + absl::InlinedVector values_vector; + absl::InlinedVector lengths_vector; + for (int i = 0; i < s_size; ++i) { + const string& v = default_value.list().s(i); + values_vector.push_back(v.data()); + lengths_vector.push_back(v.size()); + } + TFE_OpSetAttrStringList(op, attr_name, values_vector.data(), + lengths_vector.data(), s_size); + } + + // Int + if (const int i_size = default_value.list().i_size()) { + absl::InlinedVector i_vector; + for (int i = 0; i < i_size; ++i) { + i_vector.push_back(default_value.list().i(i)); + } + TFE_OpSetAttrIntList(op, attr_name, i_vector.data(), i_size); + } + // Float + if (const int f_size = default_value.list().f_size()) { + absl::InlinedVector f_vector; + for (int i = 0; i < f_size; ++i) { + f_vector.push_back(default_value.list().f(i)); + } + TFE_OpSetAttrFloatList(op, attr_name, f_vector.data(), f_size); + } + // Bool + if (const int b_size = default_value.list().b_size()) { + absl::InlinedVector b_vector; + for (int i = 0; i < b_size; i++) { + b_vector.push_back(default_value.list().b(i)); + } + TFE_OpSetAttrBoolList(op, attr_name, b_vector.data(), b_size); + } + // Type + if (const int type_size = default_value.list().type_size()) { + absl::InlinedVector type_vector; + for (int i = 0; i < type_size; ++i) { + type_vector.push_back(default_value.list().type(i)); + } + TFE_OpSetAttrTypeList( + op, attr_name, + reinterpret_cast(type_vector.data()), + type_size); + } + + // Rest are not supported. 
+ if (default_value.list().shape_size() > 0 || + default_value.list().func_size() > 0 || + default_value.list().tensor_size() > 0) { + TF_SetStatus( + status, TF_UNIMPLEMENTED, + tensorflow::strings::StrCat("Unable to get setfor default value: ", + default_value.DebugString()) + .data()); + } + } break; case tensorflow::AttrValue::kTensor: TF_FALLTHROUGH_INTENDED; case tensorflow::AttrValue::kPlaceholder: @@ -1612,19 +1671,12 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { return status.status; } - tensorflow::Status Execute(tensorflow::EagerOperation* op, + tensorflow::Status Execute(const tensorflow::EagerOperation* op, tensorflow::TensorHandle** retvals, int* num_retvals) override { - std::vector inputs; - inputs.reserve(op->Inputs().size()); - for (int i = 0; i < op->Inputs().size(); ++i) { - op->Inputs()[i]->Ref(); - inputs.push_back(tensorflow::wrap(op->Inputs()[i])); - } std::vector outputs(*num_retvals); TF_Status status; - device_.execute(context_, inputs.size(), inputs.data(), op->Name().c_str(), - wrap(&op->Attrs()), num_retvals, outputs.data(), &status, + device_.execute(tensorflow::wrap(op), num_retvals, outputs.data(), &status, info_); if (status.status.ok()) { for (int i = 0; i < *num_retvals; ++i) { @@ -1634,10 +1686,6 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { TFE_DeleteTensorHandle(outputs[i]); } } - - for (auto inp : inputs) { - TFE_DeleteTensorHandle(inp); - } return status.status; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 5afe3047dd7..a58c681e8fe 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -248,22 +248,22 @@ typedef struct TFE_Op TFE_Op; TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status); - TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); +// Returns the op or function name `op` will execute. +// +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetName(const TFE_Op* op, + TF_Status* status); +TF_CAPI_EXPORT extern TFE_Context* TFE_OpGetContext(const TFE_Op* op, + TF_Status* status); + TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status); // The returned string remains valid throughout the lifetime of 'op'. -TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op, +TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(const TFE_Op* op, TF_Status* status); -// When 'enable' is set to 1, and if TensorFlow library is built with XLA -// support, a subsequent TFE_Execute() call on `op` will run the op via XLA. -// -// If the library is not built with XLA support, this call would be a no-op. -TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op, - unsigned char enable); - TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status); @@ -272,6 +272,23 @@ TF_CAPI_EXPORT extern void TFE_OpAddInputList(TFE_Op* op, int num_inputs, TF_Status* status); +// Fetches the current number of inputs attached to `op`. +// +// Does not use the operation's definition to determine how many inputs should +// be attached. It is intended for use with TFE_OpGetFlatInput to inspect an +// already-finalized operation. +// +// Note that TFE_OpGetFlatInputCount and TFE_OpGetFlatInput operate on a flat +// sequence of inputs, unlike TFE_OpGetInputLength (for getting the length of a +// particular named input list, which may only be part of the op's inputs). 
+TF_CAPI_EXPORT extern int TFE_OpGetFlatInputCount(const TFE_Op* op, + TF_Status* status); +// Returns a borrowed reference to one of `op`'s inputs. Use +// `TFE_TensorHandleCopySharingTensor` to make a new reference. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_OpGetFlatInput(const TFE_Op* op, + int index, + TF_Status* status); + TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc index dd55f05283b..b5721cdab0a 100644 --- a/tensorflow/c/eager/c_api_debug.cc +++ b/tensorflow/c/eager/c_api_debug.cc @@ -22,9 +22,6 @@ limitations under the License. #include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/platform/status.h" -#ifdef TENSORFLOW_EAGER_USE_XLA -#include "tensorflow/compiler/jit/xla_device.h" -#endif // TENSORFLOW_EAGER_USE_XLA using tensorflow::string; @@ -64,87 +61,6 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( return nullptr; } -#ifdef TENSORFLOW_EAGER_USE_XLA - auto* device = absl::get(handle->device()); - - // If tensor resides on an XLA device, use XLA device's PaddedShapeFn. - auto* xla_device = dynamic_cast(device); - if (xla_device != nullptr) { - tensorflow::XlaDevice::PaddedShapeFn shape_fn = - xla_device->metadata().padded_shape_fn(); - xla::Shape padded_shape; - status->status = shape_fn(*tensor, &padded_shape); - if (!status->status.ok()) { - return nullptr; - } - if (VLOG_IS_ON(3)) { - std::vector shape_to_log = - TensorShapeAsVector(*handle, &status->status); - if (!status->status.ok()) { - // Ignore the status here as we are simply logging. - status->status = tensorflow::Status::OK(); - } else { - VLOG(3) << "Fully padded shape of [" - << absl::StrJoin(shape_to_log, ", ") << "] is " - << padded_shape.DebugString(); - } - } - - if (padded_shape.IsTuple()) { - if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) { - // Currently, the only case of XlaTensor containing a tuple shape is to - // represent 64 bit ints, doubles, and complex numbers (we don't support - // 64bit complex numbers). - status->status = tensorflow::errors::InvalidArgument( - "XlaTensors should only contain tuples of size 2. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - - // shape0 is not a const& because we will assign it to padded_shape below. - // It is illegal to assign a part of a message to itself. - xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0); - const xla::Shape& shape1 = - xla::ShapeUtil::GetTupleElementShape(padded_shape, 1); - if (shape0.IsTuple() || shape1.IsTuple()) { - status->status = tensorflow::errors::InvalidArgument( - "XlaTensors should not contain nested tuples. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - if (!xla::ShapeUtil::Equal(shape0, shape1)) { - status->status = tensorflow::errors::InvalidArgument( - "Subshapes of XlaTensors should be the same. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - - // Since the only case we handle here are two equal subshapes, we - // simply return one of them. The caller will interpret it as this - // shape directly storing the 64bit types. This approximation is good - // enough for this API's debugging use case. 
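The const accessors declared above (TFE_OpGetName, TFE_OpGetContext, TFE_OpGetDevice, TFE_OpGetFlatInputCount, TFE_OpGetFlatInput) make it possible to inspect a finalized op without mutating it; the CloneOp helper added to c_api_test.cc later in this change is the canonical use. A smaller sketch, in which the DumpOp name and the reduced error handling are illustrative and not part of this change:

#include <stdio.h>
#include "tensorflow/c/eager/c_api.h"

// Sketch: print an op's name, requested device, and flat inputs using the new
// const getters. Assumes `op` is a finalized TFE_Op* and `status` is owned by
// the caller.
void DumpOp(const TFE_Op* op, TF_Status* status) {
  const char* name = TFE_OpGetName(op, status);
  if (TF_GetCode(status) != TF_OK) return;
  const char* device = TFE_OpGetDevice(op, status);
  if (TF_GetCode(status) != TF_OK) return;
  int num_inputs = TFE_OpGetFlatInputCount(op, status);
  if (TF_GetCode(status) != TF_OK) return;
  printf("%s on '%s' with %d flat input(s)\n", name, device, num_inputs);
  for (int i = 0; i < num_inputs; ++i) {
    // Borrowed reference; copy with TFE_TensorHandleCopySharingTensor if it
    // must outlive `op`.
    TFE_TensorHandle* input = TFE_OpGetFlatInput(op, i, status);
    if (TF_GetCode(status) != TF_OK) return;
    printf("  input %d on %s\n", i, TFE_TensorHandleDeviceName(input, status));
  }
}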
- padded_shape = shape0; - } - - int rank = padded_shape.dimensions_size(); - std::vector dev_dims; - dev_dims.reserve(rank); - if (rank == 1) { - // Rank 1 tensors might not have padded_shape.layout.minor_to_major set, - dev_dims.push_back(padded_shape.dimensions(0)); - } else { - for (int i = rank - 1; i >= 0; --i) { - tensorflow::int64 dim_index = padded_shape.layout().minor_to_major(i); - dev_dims.push_back(padded_shape.dimensions(dim_index)); - } - } - status->status = tensorflow::Status::OK(); - return new TFE_TensorDebugInfo(dev_dims); - } -#endif // TENSORFLOW_EAGER_USE_XLA - - // If the tensor is not an XLA tensor, the device shape is - // the same as regular tensor shape. std::vector dev_dims = TensorShapeAsVector(*handle, &status->status); if (!status->status.ok()) { diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index 3738768cf02..2718c75c3ee 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -121,25 +121,6 @@ string AddVariablesFunction() { return def.SerializeAsString(); } -void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { - TF_Status* status = TF_NewStatus(); - TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(op, var_handle, status); - TFE_TensorHandle* is_initialized[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(op, &is_initialized[0], &num_retvals, status); - CHECK_EQ(1, num_retvals); - TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); - bool initialized = false; - memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); - EXPECT_EQ(initialized, true); - TF_DeleteTensor(t); - TFE_DeleteTensorHandle(is_initialized[0]); - TFE_DeleteOp(op); - delete status; -} - void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); @@ -182,9 +163,8 @@ void TestFunctionWithPackedInput(const bool remote) { // Add a sync point in order to make sure that variables have been initialized // before the function execution starts. - // TODO(b/155789951): Remove once b/155789951 is fixed. - VarIsInitialized(ctx, h1); - VarIsInitialized(ctx, h2); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); // Pack 3 variable handles into one TFE_TensorHandle. // When remote is false, function device is placed on task0. Handle types are @@ -396,6 +376,8 @@ TEST(CAPI, DistributedFunctionGraphPassOnlyOnce) { TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); EXPECT_NE(var_handle, nullptr); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); const string function_def = VariableAddFunction(); TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), @@ -517,8 +499,11 @@ void TestDistributedFunctionCancellation(bool inject_error) { TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); EXPECT_NE(var_handle, nullptr); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - const string function_def = VariableAddFunctionWithGraphError(); + const string function_def = inject_error ? 
VariableAddFunctionWithGraphError() + : VariableAddFunction(); TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), status); ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 7390cf243be..eabb159a631 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -486,29 +486,6 @@ TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2( static_cast(sampler->sampler->GetCell(label1, label2))); } -void TFE_ContextOptionsSetMirroringPolicy(TFE_ContextOptions* options, - TFE_ContextMirroringPolicy policy) { - options->mirroring_policy = policy; -} - -void TFE_ContextSetThreadLocalMirroringPolicy( - TFE_Context* ctx, TFE_ContextMirroringPolicy policy) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetThreadLocalMirroringPolicy( - static_cast(policy)); -} - -// Note: this function looks up a thread local policy. So it should be called in -// the appropriate client thread. In particular, in async mode, it may not be -// safe to call this function from the async EagerExecutor threads. -extern TFE_ContextMirroringPolicy TFE_ContextGetMirroringPolicy( - TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - return static_cast(context->GetMirroringPolicy()); -} - void TFE_ContextOptionsSetLazyRemoteInputsCopy(TFE_ContextOptions* options, bool lazy_copy) { options->lazy_remote_inputs_copy = lazy_copy; diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 1af76c01154..12546c6082a 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -265,33 +265,6 @@ TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler2( TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2( TFE_MonitoringSampler2* sampler, const char* label1, const char* label2); -// LINT.IfChange -// Note: Keep in sync with internal copy of enum in eager/context.h. -typedef enum TFE_ContextMirroringPolicy { - // Do not maintain mirrors in a TensorHandle, instead make new TensorHandle - // copies with their own lifetime. - TFE_MIRRORING_NONE = 0, - // Mirroring any remote tensor handles, associating them with the lifetime of - // the local TensorHandle. - TFE_MIRRORING_ALL = 1, -} TFE_ContextMirroringPolicy; -// LINT.ThenChange(//tensorflow/core/common_runtime/eager/context.h) - -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetMirroringPolicy( - TFE_ContextOptions*, TFE_ContextMirroringPolicy); - -// Sets a thread-local mirroring policy. After this call, other calls to -// TFE_Execute in the same thread will use the mirroring policy specified here -// instead of the mirroring policy used to construct the context. This has no -// effect on the mirroring policy used by other program threads. -TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalMirroringPolicy( - TFE_Context*, TFE_ContextMirroringPolicy); - -// Returns the mirroring policy to be used by this context in the current -// thread. -TF_CAPI_EXPORT extern TFE_ContextMirroringPolicy TFE_ContextGetMirroringPolicy( - TFE_Context*); - // Sets whether to copy the remote inputs of a function lazily. 
TF_CAPI_EXPORT extern void TFE_ContextOptionsSetLazyRemoteInputsCopy( TFE_ContextOptions*, bool lazy_copy); @@ -441,7 +414,7 @@ typedef struct TFE_OpAttrs TFE_OpAttrs; // Fetch a reference to `op`'s attributes. The returned reference is only valid // while `op` is alive. -const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op); +TF_CAPI_EXPORT extern const TFE_OpAttrs* TFE_OpGetAttrs(const TFE_Op* op); // Add attributes in `attrs` to `op`. // // Does not overwrite or update existing attributes, but adds new ones. @@ -462,7 +435,11 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrValueProto(const TFE_Op* op, size_t proto_len, TF_Status* status); -#define TFE_CUSTOM_DEVICE_VERSION 2 +// TODO(b/166642410): It would be nice, for custom devices and for other users, +// to have a non-string representation of devices (TF_Device) extracted from +// tensors/ops/etc. and usable in APIs like OpSetDevice/ResetOp/etc. + +#define TFE_CUSTOM_DEVICE_VERSION 3 // Struct to be filled in typedef struct TFE_CustomDevice { @@ -481,9 +458,16 @@ typedef struct TFE_CustomDevice { void* device_info); // Method to execute an operation. - void (*execute)(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, + // + // Arguments provide enough information to reconstruct the original `TFE_Op`, + // or construct a transformed version, by inspecting the passed `op`. + // + // TFE_OpGetDevice(op) records the original placement of the operation. It may + // be an empty string if no device was explicitly requested, but will + // otherwise be the name of this custom device. Ops are placed onto a custom + // device if any of their inputs are on that custom device, but custom devices + // are free to set a bad status in order to require explicit placement. + void (*execute)(const TFE_Op* op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info); // Method to delete a device. diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index a4d31417073..4975d303375 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -316,86 +316,6 @@ TEST(CAPI, Function_ident_CPU) { TF_DeleteStatus(status); } -#ifdef TENSORFLOW_EAGER_USE_XLA -TEST(CAPI, Function_ident_XLA_CPU) { - // First create a simple identity function. 
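The version-3 `execute` comment above implies that a custom device rebuilds or transforms the op it is handed using the same introspection APIs added in this change. A minimal pass-through callback under that contract might look like the sketch below; the PassThroughExecute name, the hard-coded CPU target, and the ignored device_info payload are illustrative assumptions, and a real device would also translate any tensor handles it owns rather than forwarding them directly.

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

// Sketch of a TFE_CUSTOM_DEVICE_VERSION 3 `execute` callback that re-creates
// the incoming op on a physical device and runs it there.
void PassThroughExecute(const TFE_Op* original_op, int* num_outputs,
                        TFE_TensorHandle** outputs, TF_Status* s,
                        void* device_info) {
  (void)device_info;  // Unused in this sketch.
  TFE_Context* ctx = TFE_OpGetContext(original_op, s);
  if (TF_GetCode(s) != TF_OK) return;
  const char* op_name = TFE_OpGetName(original_op, s);
  if (TF_GetCode(s) != TF_OK) return;
  TFE_Op* op = TFE_NewOp(ctx, op_name, s);
  if (TF_GetCode(s) != TF_OK) return;
  TFE_OpAddAttrs(op, TFE_OpGetAttrs(original_op));
  // Re-place the op; a real device would choose this from its configuration.
  TFE_OpSetDevice(op, "/job:localhost/replica:0/task:0/device:CPU:0", s);
  int num_inputs = TFE_OpGetFlatInputCount(original_op, s);
  for (int i = 0; i < num_inputs && TF_GetCode(s) == TF_OK; ++i) {
    TFE_OpAddInput(op, TFE_OpGetFlatInput(original_op, i, s), s);
  }
  if (TF_GetCode(s) == TF_OK) {
    TFE_Execute(op, outputs, num_outputs, s);
  }
  TFE_DeleteOp(op);
}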
- TF_Graph* function_graph = TF_NewGraph(); - TF_OperationDescription* arg_descr = - TF_NewOperation(function_graph, "Placeholder", "arg"); - TF_SetAttrType(arg_descr, "dtype", TF_INT32); - TF_Status* status = TF_NewStatus(); - TF_Operation* arg = TF_FinishOperation(arg_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_OperationDescription* id_descr = - TF_NewOperation(function_graph, "Identity", "id"); - TF_SetAttrType(id_descr, "T", TF_INT32); - TF_AddInput(id_descr, {arg, 0}); - TF_Operation* id = TF_FinishOperation(id_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_Output input{arg, 0}; - TF_Output output{id, 0}; - TF_Function* fn = - TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, - &output, nullptr, nullptr, "test", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteGraph(function_graph); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* ctx = TFE_NewContext(opts, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - TFE_ContextAddFunction(ctx, fn, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteFunction(fn); - - for (bool async : {false, true, false}) { - TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx); - TFE_Executor* executor = TFE_NewExecutor(async); - TFE_ContextSetExecutorForThread(ctx, executor); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK); - TF_Tensor* t = - TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); - *reinterpret_cast(TF_TensorData(t)) = 42; - TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteTensor(t); - - TFE_Op* op = TFE_NewOp(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_OpAddInput(op, h, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - - // Now run it via XLA. 
- TFE_OpSetXLACompilation(op, true); - - std::vector result; - result.push_back(nullptr); - int num_retvals = 1; - TFE_Execute(op, result.data(), &num_retvals, status); - TFE_DeleteOp(op); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - ASSERT_EQ(num_retvals, 1); - - TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); - TFE_ContextSetExecutorForThread(ctx, old_executor); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteExecutor(executor); - TFE_DeleteExecutor(old_executor); - TFE_DeleteTensorHandle(h); - TF_DeleteTensor(r); - TFE_DeleteTensorHandle(result[0]); - } - TFE_ContextRemoveFunction(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContext(ctx); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteStatus(status); -} -#endif // TENSORFLOW_EAGER_USE_XLA - void Executor_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 4d9be0c2501..356476c2186 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -32,7 +32,6 @@ struct TFE_ContextOptions { bool async = false; TFE_ContextDevicePlacementPolicy device_placement_policy{ TFE_DEVICE_PLACEMENT_SILENT}; - TFE_ContextMirroringPolicy mirroring_policy{TFE_MIRRORING_NONE}; // If true, lazily copy the remote inputs of a function to the target devices. bool lazy_remote_inputs_copy = true; // If true, use TFRT backend diff --git a/tensorflow/c/eager/c_api_remote_function_test.cc b/tensorflow/c/eager/c_api_remote_function_test.cc new file mode 100644 index 00000000000..a9bbd5b694f --- /dev/null +++ b/tensorflow/c/eager/c_api_remote_function_test.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/c_api_remote_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace { + +void TestRemoteExecuteSilentCopiesFunc(bool async, bool remote, + bool heavy_load_on_streaming_rpc, + bool remote_func_outputs = false) { + return TestRemoteExecuteSilentCopies(async, remote, /*func=*/true, + heavy_load_on_streaming_rpc, + remote_func_outputs); +} + +TEST(CAPI, RemoteExecuteSilentCopiesAsyncFunc) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/true, + /*heavy_load_on_streaming_rpc=*/false); +} +TEST(CAPI, RemoteExecuteSilentCopiesFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/false, /*remote=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, RemoteExecuteSilentCopiesAsyncFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFunc) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/false, + /*heavy_load_on_streaming_rpc=*/false); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/false, /*remote=*/false, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/false, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { + // A remote input may be not ready when we start running a function. Test that + // the function execution should wait until the remote input is ready. + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/false, + /*heavy_load_on_streaming_rpc=*/true); +} + +} // namespace diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 94c32cf3f30..e68e15ba560 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_remote_test_util.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" @@ -115,225 +117,24 @@ void TestRemoteExecute(bool async) { TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } -string MatMulFunction() { - tensorflow::FunctionDef def; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - " signature {" - " name: 'MatMulFunction'" - " input_arg {" - " name: 'a'" - " type: DT_FLOAT" - " }" - " input_arg {" - " name: 'b'" - " type: DT_FLOAT" - " }" - " output_arg {" - " name: 'm'" - " type: DT_FLOAT" - " }" - " }" - " node_def {" - " name: 'matmul'" - " op: 'MatMul'" - " input: 'a'" - " input: 'b'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " ret {" - " key: 'm'" - " value: 'matmul:product'" - " }", - &def)); - return def.SerializeAsString(); -} - -// If heavy_load_on_streaming_rpc is true, send some rpc reqeusts before the one -// which creates a remote remote input, to simulate a scenario that the remote -// input is not ready when we start running an op or a function. -void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, - bool heavy_load_on_streaming_rpc) { - tensorflow::ServerDef server_def = GetServerDef(3); - - // This server def has the task index set to 0. - string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - std::unique_ptr worker_server1; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server1) - .ok()); - ASSERT_TRUE(worker_server1->Start().ok()); - - server_def.set_task_index(2); - std::unique_ptr worker_server2; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server2) - .ok()); - ASSERT_TRUE(worker_server2->Start().ok()); - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(ctx); - TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(ctx); - std::vector handles_task0; - if (heavy_load_on_streaming_rpc) { - // Send 50 tensor copy requests to simulate that there have been some RPC - // requests been enqueued. 
- for (int i = 0; i < 50; ++i) { - handles_task0.push_back(TestMatrixTensorHandle(ctx)); - } - } - const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; - const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; - - std::vector handles_task2; - for (auto* h_task0 : handles_task0) { - handles_task2.push_back( - TFE_TensorHandleCopyToDevice(h_task0, ctx, task2_name, status)); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - } - - auto* h1_task2 = - TFE_TensorHandleCopyToDevice(h1_task0, ctx, task2_name, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - TFE_Op* matmul = nullptr; - if (func) { - string function_def = MatMulFunction(); - TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), - status); - CHECK_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - matmul = TFE_NewOp(ctx, "MatMulFunction", status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(matmul, h0_task0, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(matmul, h1_task2, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - } else { - // Handles are on task0 (local), and task2, but op is on task1. - matmul = MatMulOp(ctx, h0_task0, h1_task2); - } - if (remote) { - TFE_OpSetDevice(matmul, task1_name, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - } else if (!async) { - // Set the local device to CPU to easily validate mirroring - string cpu_device_name; - ASSERT_TRUE(GetDeviceName(ctx, &cpu_device_name, "CPU")); - TFE_OpSetDevice(matmul, cpu_device_name.c_str(), status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - auto remote_arg = - tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); - // The input handles should never change since they have been mirrored. - ASSERT_FALSE(remote_arg->HasLocalMirror(nullptr)); - } - - TFE_TensorHandle* retvals[1]; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - // TODO(gjn): Add support for waiting on async local mirrors - if (!remote && !async) { - auto remote_arg = - tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); - // The input handles should never change since they have been mirrored. 
- ASSERT_TRUE(remote_arg->HasLocalMirror(nullptr)); - } - - auto* retval_task0 = TFE_TensorHandleCopyToDevice( - retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteTensorHandle(retval_task0); - float product[4] = {0}; - EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); - memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(7, product[0]); - EXPECT_EQ(10, product[1]); - EXPECT_EQ(15, product[2]); - EXPECT_EQ(22, product[3]); - - TFE_DeleteTensorHandle(h0_task0); - TFE_DeleteTensorHandle(h1_task0); - TFE_DeleteTensorHandle(h1_task2); - TFE_DeleteTensorHandle(retvals[0]); - for (auto* h : handles_task0) { - TFE_DeleteTensorHandle(h); - } - for (auto* h : handles_task2) { - TFE_DeleteTensorHandle(h); - } - - TFE_DeleteOp(matmul); - - TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteExecutor(executor); - if (func) { - TFE_ContextRemoveFunction(ctx, "MatMulFunction", status); - } - TFE_DeleteContext(ctx); - - TF_DeleteStatus(status); - - // TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server1.release(); - worker_server2.release(); +void TestRemoteExecuteSilentCopiesOp(bool async, bool remote, + bool remote_func_outputs = false) { + return TestRemoteExecuteSilentCopies(async, remote, /*func=*/false, + /*heavy_load_on_streaming_rpc=*/false, + remote_func_outputs); } TEST(CAPI, RemoteExecuteSilentCopies) { - TestRemoteExecuteSilentCopies(/*async=*/false, /*remote=*/true, - /*func=*/false, - /*heavy_load_on_streaming_rpc=*/false); + TestRemoteExecuteSilentCopiesOp(/*async=*/false, /*remote=*/true); } TEST(CAPI, RemoteExecuteSilentCopiesAsync) { - TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/true, /*func=*/false, - /*heavy_load_on_streaming_rpc=*/false); -} -TEST(CAPI, RemoteExecuteSilentCopiesAsyncFunc) { - TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/true, /*func=*/true, - /*heavy_load_on_streaming_rpc=*/false); + TestRemoteExecuteSilentCopiesOp(/*async=*/true, /*remote=*/true); } TEST(CAPI, RemoteExecuteSilentCopiesLocal) { - TestRemoteExecuteSilentCopies(/*async=*/false, /*remote=*/false, - /*func=*/false, - /*heavy_load_on_streaming_rpc=*/false); + TestRemoteExecuteSilentCopiesOp(/*async=*/false, /*remote=*/false); } TEST(CAPI, RemoteExecuteSilentCopiesLocalAsync) { - TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, - /*func=*/false, - /*heavy_load_on_streaming_rpc=*/false); -} -TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFunc) { - TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, - /*heavy_load_on_streaming_rpc=*/false); -} -TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { - // A remote input may be not ready when we start running a function. Test that - // the function execution should wait until the remote input is ready. 
- TestRemoteExecuteSilentCopies(/*async=*/true, /*remote=*/false, /*func=*/true, - /*heavy_load_on_streaming_rpc=*/true); + TestRemoteExecuteSilentCopiesOp(/*async=*/true, /*remote=*/false); } } // namespace diff --git a/tensorflow/c/eager/c_api_remote_test_util.cc b/tensorflow/c/eager/c_api_remote_test_util.cc new file mode 100644 index 00000000000..159fa442a73 --- /dev/null +++ b/tensorflow/c/eager/c_api_remote_test_util.cc @@ -0,0 +1,222 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api_remote_test_util.h" + +#include "absl/strings/str_cat.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +using ::tensorflow::string; + +string MatMulFunction(const string& matmul_device) { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + absl::StrCat(" signature {" + " name: 'MatMulFunction'" + " input_arg {" + " name: 'a'" + " type: DT_FLOAT" + " }" + " input_arg {" + " name: 'b'" + " type: DT_FLOAT" + " }" + " output_arg {" + " name: 'm'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'matmul'" + " op: 'MatMul'" + " input: 'a'" + " input: 'b'" + " device: '", + matmul_device, "'", + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'm'" + " value: 'matmul:product'" + " }"), + &def)); + return def.SerializeAsString(); +} + +void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, + bool heavy_load_on_streaming_rpc, + bool remote_func_outputs) { + tensorflow::ServerDef server_def = GetServerDef(3); + + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(ctx); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(ctx); + std::vector handles_task0; + if (heavy_load_on_streaming_rpc) { + // Send 50 tensor copy requests to simulate that there have been some RPC + // requests been enqueued. + for (int i = 0; i < 50; ++i) { + handles_task0.push_back(TestMatrixTensorHandle(ctx)); + } + } + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + std::vector handles_task2; + for (auto* h_task0 : handles_task0) { + handles_task2.push_back( + TFE_TensorHandleCopyToDevice(h_task0, ctx, task2_name, status)); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } + + auto* h1_task2 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, task2_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* matmul = nullptr; + if (func) { + const string matmul_device = remote_func_outputs ? task2_name : ""; + string function_def = MatMulFunction(matmul_device); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + CHECK_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + matmul = TFE_NewOp(ctx, "MatMulFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(matmul, h0_task0, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(matmul, h1_task2, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } else { + // Handles are on task0 (local), and task2, but op is on task1. + matmul = MatMulOp(ctx, h0_task0, h1_task2); + } + if (remote) { + TFE_OpSetDevice(matmul, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } else if (!async) { + // Set the local device to CPU to easily validate mirroring + string cpu_device_name; + ASSERT_TRUE(GetDeviceName(ctx, &cpu_device_name, "CPU")); + TFE_OpSetDevice(matmul, cpu_device_name.c_str(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + auto remote_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); + // The input handles should never change since they have been mirrored. 
+ ASSERT_FALSE(remote_arg->HasLocalMirror(nullptr)); + } + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(gjn): Add support for waiting on async local mirrors + if (!remote && !async && !remote_func_outputs) { + auto remote_arg = + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(h1_task2)); + // The input handles should never change since they have been mirrored. + ASSERT_TRUE(remote_arg->HasLocalMirror(nullptr)); + } + + if (remote_func_outputs) { + const string backing_device = + TFE_TensorHandleBackingDeviceName(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(backing_device, task2_name); + } + + auto* retval_task0 = TFE_TensorHandleCopyToDevice( + retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteTensorHandle(retval_task0); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(h1_task0); + TFE_DeleteTensorHandle(h1_task2); + TFE_DeleteTensorHandle(retvals[0]); + for (auto* h : handles_task0) { + TFE_DeleteTensorHandle(h); + } + for (auto* h : handles_task2) { + TFE_DeleteTensorHandle(h); + } + + TFE_DeleteOp(matmul); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteExecutor(executor); + if (func) { + TFE_ContextRemoveFunction(ctx, "MatMulFunction", status); + } + TFE_DeleteContext(ctx); + + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} diff --git a/tensorflow/c/eager/c_api_remote_test_util.h b/tensorflow/c/eager/c_api_remote_test_util.h new file mode 100644 index 00000000000..08633689402 --- /dev/null +++ b/tensorflow/c/eager/c_api_remote_test_util.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ +#define TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ + +// Run a function containing a MatMul op and check its output. +// If heavy_load_on_streaming_rpc is true, send some rpc reqeusts before the one +// which creates a remote remote input, to simulate a scenario that the remote +// input is not ready when we start running an op or a function. 
+void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, + bool heavy_load_on_streaming_rpc, + bool remote_func_outputs = false); + +#endif // TENSORFLOW_C_EAGER_C_API_REMOTE_TEST_UTIL_H_ diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 724176505ba..fd208c6770d 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include // clang-format off +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/platform/platform.h" // clang-format on @@ -876,89 +877,6 @@ TEST(CAPI, Execute_Min_CPU) { TF_DeleteStatus(status); } -#ifdef TENSORFLOW_EAGER_USE_XLA -void Execute_MatMul_XLA_CPU(bool async) { - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_Context* ctx = TFE_NewContext(opts, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_TensorHandle* m = TestMatrixTensorHandle(ctx); - TFE_Op* matmul = MatMulOp(ctx, m, m); - - TFE_OpSetXLACompilation(matmul, true); - - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - // Running a primitive TF operator via XLA is not yet supported. - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_DeleteOp(matmul); - TFE_DeleteTensorHandle(m); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - EXPECT_EQ(1, num_retvals); - - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - TFE_DeleteTensorHandle(retvals[0]); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - float product[4] = {0}; - EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); - memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(7, product[0]); - EXPECT_EQ(10, product[1]); - EXPECT_EQ(15, product[2]); - EXPECT_EQ(22, product[3]); - TFE_DeleteContext(ctx); - TF_DeleteStatus(status); -} -TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); } -TEST(CAPI, Execute_MatMul_XLA_CPUAsync) { Execute_MatMul_XLA_CPU(true); } - -void Execute_Min_XLA_CPU(bool async) { - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_Context* ctx = TFE_NewContext(opts, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_TensorHandle* input = TestMatrixTensorHandle(ctx); - TFE_TensorHandle* axis = TestAxisTensorHandle(ctx); - TFE_Op* minOp = MinOp(ctx, input, axis); - - TFE_OpSetXLACompilation(minOp, true); - - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(minOp, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteOp(minOp); - TFE_DeleteTensorHandle(input); - TFE_DeleteTensorHandle(axis); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_EQ(1, num_retvals); - - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - TFE_DeleteTensorHandle(retvals[0]); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - float output[2] = {0}; - EXPECT_EQ(sizeof(output), TF_TensorByteSize(t)); - memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(1, output[0]); - EXPECT_EQ(3, output[1]); - TFE_DeleteContext(ctx); - 
TF_DeleteStatus(status); -} -TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); } -TEST(CAPI, Execute_Min_XLA_CPUAsync) { Execute_Min_XLA_CPU(true); } -#endif // TENSORFLOW_EAGER_USE_XLA - void ExecuteWithTracing(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -1274,6 +1192,68 @@ TEST(CAPI, StringAttributes) { TF_DeleteStatus(status); } +// Same test as above, expect use SetOpAttrValueScalar to set attrs. +TEST(CAPI, TestTFE_SetOpAttrs) { + // Test that TFE_OpSetAttrString doesn't hold on to the value after it + // returns. + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::vector dims(4, 1); + TFE_Op* op = TFE_NewOp(ctx, "AvgPool", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* tensor = + TF_AllocateTensor(TF_FLOAT, dims.data(), dims.size(), sizeof(float)); + float tensor_data[] = {1}; + memcpy(TF_TensorData(tensor), tensor_data, TF_TensorByteSize(tensor)); + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandle(tensor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, tensor_handle, status); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(tensor_handle); + + tensorflow::AttrValue i_list_values; + for (int i = 0; i < 4; ++i) { + i_list_values.mutable_list()->add_i(1); + } + SetOpAttrValueScalar(ctx, op, i_list_values, "ksize", status); + SetOpAttrValueScalar(ctx, op, i_list_values, "strides", status); + + tensorflow::AttrValue padding_value; + *padding_value.mutable_s() = "VALID"; + tensorflow::SetOpAttrValueScalar(ctx, op, padding_value, "padding", status); + + tensorflow::AttrValue data_format_value; + *data_format_value.mutable_s() = "NHWC"; + tensorflow::SetOpAttrValueScalar(ctx, op, data_format_value, "data_format", + status); + + TFE_OpSetAttrType(op, "T", TF_FLOAT); + + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(op, &retvals[0], &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + + tensor = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(4, TF_TensorByteSize(tensor)); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(op); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + TEST(CAPI, TestTFE_TensorHandleCopySharingUnderlyingTensorHandle) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -1620,4 +1600,91 @@ TEST(CAPI, TestTFE_OpAttrsSerialize) { TFE_DeleteContext(ctx); } +// Needs to work with a const TFE_Op since custom devices should not modify the +// op they are called with. 
+TFE_Op* CloneOp(const TFE_Op* other) { + TF_Status* status = TF_NewStatus(); + TFE_Context* context = TFE_OpGetContext(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char* op_name = TFE_OpGetName(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* ret = TFE_NewOp(context, op_name, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char* device = TFE_OpGetDevice(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetDevice(ret, device, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddAttrs(ret, TFE_OpGetAttrs(other)); + int num_inputs = TFE_OpGetFlatInputCount(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + for (int input_index = 0; input_index < num_inputs; ++input_index) { + TFE_TensorHandle* input = TFE_OpGetFlatInput(other, input_index, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(ret, input, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + } + TF_DeleteStatus(status); + return ret; +} + +TEST(CAPI, TestTFE_OpRecreation) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + // Clone an op with attributes and a device set. + TFE_Op* original_var_op = TFE_NewOp(ctx, "VarHandleOp", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(original_var_op, "dtype", TF_INT64); + TFE_OpSetAttrShape(original_var_op, "shape", {}, 0, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ("", std::string(TFE_OpGetDevice(original_var_op, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetDevice(original_var_op, + "/job:localhost/replica:0/task:0/device:CPU:0", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* cloned = CloneOp(original_var_op); + + EXPECT_EQ("/job:localhost/replica:0/task:0/device:CPU:0", + std::string(TFE_OpGetDevice(cloned, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ("VarHandleOp", std::string(TFE_OpGetName(cloned, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + int num_retvals = 1; + TFE_TensorHandle* ret; + TFE_Execute(cloned, &ret, &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(ret); + + // Clone an op with inputs and no device set. 
+ TFE_TensorHandle* input1 = TestMatrixTensorHandle(ctx); + TFE_TensorHandle* input2 = TestMatrixTensorHandle(ctx); + TFE_Op* original_identity = TFE_NewOp(ctx, "IdentityN", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandle* inputs[] = {input1, input2}; + TFE_OpAddInputList(original_identity, inputs, 2, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* cloned_identity = CloneOp(original_identity); + EXPECT_EQ("", std::string(TFE_OpGetDevice(cloned_identity, status))); + TFE_TensorHandle* identity_ret[] = {nullptr, nullptr}; + num_retvals = 2; + TFE_Execute(cloned_identity, identity_ret, &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_DeleteTensorHandle(input1); + TFE_DeleteTensorHandle(input2); + TFE_DeleteTensorHandle(identity_ret[0]); + TFE_DeleteTensorHandle(identity_ret[1]); + + TFE_DeleteOp(cloned_identity); + TFE_DeleteOp(original_identity); + TFE_DeleteOp(original_var_op); + TFE_DeleteOp(cloned); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + } // namespace diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 192f10533a6..fd68866f502 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -102,6 +102,32 @@ TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, return th; } +TFE_TensorHandle* TestTensorHandleWithDimsFloat(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, &dims[0], num_dims, status); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* TestTensorHandleWithDimsInt(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_INT32, &dims[0], num_dims, status); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx) { constexpr int64_t dims[] = {100, 100}; constexpr int num_elements = dims[0] * dims[1]; diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index fcf407aa9c3..2f77ae5cf44 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -40,6 +40,14 @@ TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, float data[], int64_t dims[], int num_dims); +// Get a Matrix TensorHandle with given float values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsFloat(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims); + +// Get a Matrix TensorHandle with given int values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsInt(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims); + // Return a tensor handle containing a 100x100 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx); diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 8408f7ef60f..2d290df19ce 
100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -39,7 +39,7 @@ static FactoriesMap& GetFactories() { return *factories; } -static const char* default_factory = ""; +static tracing::FactoryFunction default_factory; void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { assert((!GetFactories().count(name)) || @@ -48,15 +48,15 @@ void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { GetFactories()[name] = factory; } -void SetDefaultTracingEngine(const char* name) { default_factory = name; } - -static TracingContext* CreateTracingExecutionContext(const char* fn_name, - TF_Status* s) { - auto entry = GetFactories().find(default_factory); - if (entry != GetFactories().end()) return entry->second(fn_name, s); +Status SetDefaultTracingEngine(const char* name) { + auto entry = GetFactories().find(name); + if (entry != GetFactories().end()) { + default_factory = GetFactories().find(name)->second; + return Status::OK(); + } string msg = absl::StrCat( - "No tracing engine factory has been registered with the key '", - default_factory, "' (available: "); + "No tracing engine factory has been registered with the key '", name, + "' (available: "); // Ensure deterministic (sorted) order in the error message std::set factories_sorted; for (const auto& factory : GetFactories()) @@ -68,7 +68,16 @@ static TracingContext* CreateTracingExecutionContext(const char* fn_name, } msg += ")"; - TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return errors::InvalidArgument(msg.c_str()); +} + +static TracingContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + if (default_factory) { + return default_factory(fn_name, s); + } + Set_TF_Status_from_Status( + s, errors::FailedPrecondition("default_factory is nullptr")); return nullptr; } @@ -99,8 +108,8 @@ using tensorflow::tracing::TracingContext; using tensorflow::tracing::TracingOperation; using tensorflow::tracing::TracingTensorHandle; -void TF_SetTracingImplementation(const char* name) { - SetDefaultTracingEngine(name); +void TF_SetTracingImplementation(const char* name, TF_Status* s) { + Set_TF_Status_from_Status(s, SetDefaultTracingEngine(name)); } // Creates a new TensorFlow function, it is an execution context attached to a diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index b66869b4290..d216b4e694b 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -52,7 +52,7 @@ typedef struct TF_AbstractFunction TF_AbstractFunction; // This allows the client to swap the implementation of the tracing engine. // Any future call to TF_CreateFunction will use the implementation defined // here. -void TF_SetTracingImplementation(const char* name); +void TF_SetTracingImplementation(const char* name, TF_Status*); // Creates a new TensorFlow function. A Function is an execution context, and as // such it can trace operations through TF_ExecuteOperation. 
diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 7bda3aed76d..0e9d6c18157 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -85,7 +85,11 @@ class GraphOperation : public TracingOperation { return errors::FailedPrecondition( "GraphOperation::Reset must be called before calling SetOpName."); } - op_.reset(TF_NewOperation(g_, op_type_.c_str(), op_name)); + // TODO(b/145674566): We use Graph::NewName to get a unique name here but + // this may not be consistent with python's naming policy. + mutex_lock l(g_->mu); + op_.reset(new TF_OperationDescription(g_, op_type_.c_str(), + g_->graph.NewName(op_name).c_str())); return Status::OK(); } const string& Name() const override { return op_type_; } @@ -361,9 +365,10 @@ class GraphContext : public TracingContext { } auto s = TF_NewStatus(); - func->func = TF_GraphToFunction( - graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), - graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + func->func = TF_GraphToFunction(graph_.get(), name_.data(), 0, -1, nullptr, + inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), + nullptr, nullptr, name_.data(), s); TF_RETURN_IF_ERROR(StatusFromTF_Status(s)); TF_DeleteStatus(s); *f = func.release(); @@ -387,7 +392,7 @@ class GraphContext : public TracingContext { private: std::unique_ptr graph_; std::vector inputs_; - const char* name_; + string name_; }; static TracingContext* GraphTracingFactory(const char* name, TF_Status* s) { @@ -397,7 +402,7 @@ static TracingContext* GraphTracingFactory(const char* name, TF_Status* s) { // Register the tracing implemented in this file as the default tracing engine. static bool register_tracing = [] { RegisterTracingEngineFactory("graphdef", GraphTracingFactory); - SetDefaultTracingEngine("graphdef"); + SetDefaultTracingEngine("graphdef").IgnoreError(); return true; }(); diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index c00e04d98af..9433fe8f120 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -120,7 +120,7 @@ class TracingContext : public AbstractContext { }; typedef TracingContext* (*FactoryFunction)(const char* fn_name, TF_Status*); -void SetDefaultTracingEngine(const char* name); +Status SetDefaultTracingEngine(const char* name); void RegisterTracingEngineFactory(const ::tensorflow::string& name, FactoryFunction factory); } // namespace tracing diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index c56e8ab05fc..432ddb4b2d4 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -22,19 +22,30 @@ limitations under the License.
#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" +using tensorflow::Status; using tensorflow::string; +using tensorflow::TF_StatusPtr; namespace tensorflow { namespace { +// The tests are parameterized on: +// - a string representing the tracing implementation: "mlir" or "graphdef". +// - a boolean that when true enables TFRT as the execution engine. class UnifiedCAPI : public ::testing::TestWithParam> { protected: void SetUp() override { - TF_SetTracingImplementation(std::get<0>(GetParam())); + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); } }; @@ -554,7 +565,7 @@ TEST_P(UnifiedCAPI, TestMultiOutputGraph) { auto* add_op = TF_NewAbstractOp(graph_ctx); TF_AbstractOpSetOpType(add_op, "Add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); - TF_AbstractOpSetOpName(add_op, "my_add1", s); + TF_AbstractOpSetOpName(add_op, "my_add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); TF_AbstractTensor* inputs[2] = {arg0, arg1}; TF_OutputList* add_outputs = TF_NewOutputList(); @@ -576,7 +587,7 @@ TEST_P(UnifiedCAPI, TestMultiOutputGraph) { auto* add_op = TF_NewAbstractOp(graph_ctx); TF_AbstractOpSetOpType(add_op, "Add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); - TF_AbstractOpSetOpName(add_op, "my_add2", s); + TF_AbstractOpSetOpName(add_op, "my_add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); TF_AbstractTensor* inputs[2] = {arg1, arg1}; TF_OutputList* add_outputs = TF_NewOutputList(); @@ -983,6 +994,10 @@ TEST_P(UnifiedCAPI, TF_ExecutionContextGetTFEContextFromFunctionContextRaises) { TF_DeleteExecutionContext(graph_ctx); } + +// The above tests are run for a combination of: +// - graphdef and MLIR tracing engine +// - Using TFRT as an execution runtime (true == enable TFRT) #ifdef PLATFORM_GOOGLE INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Combine(::testing::Values("graphdef", diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index 1c078d4f42c..b058c79a17b 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -36,7 +36,8 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context, name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context, name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* hcpu = TestMatrixTensorHandle(context); ASSERT_FALSE(arrived); @@ -73,7 +74,8 @@ TEST(CUSTOM_DEVICE, ResetOperation) { bool executed = false; const char* custom_device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), custom_device_name, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom_device_name, + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -103,7 +105,8 @@ TEST(CUSTOM_DEVICE, MakeVariable) { bool 
arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle placed on the custom device. @@ -187,7 +190,8 @@ TEST(CUSTOM_DEVICE, AccessVariableOnCustomDevice) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/false, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle placed on the custom device. @@ -264,10 +268,12 @@ TEST(CUSTOM_DEVICE, InputBasedPlacement) { const char* custom1 = "/job:localhost/replica:0/task:0/device:CUSTOM:1"; bool arrived = false; bool executed = false; - RegisterLoggingDevice(context.get(), custom0, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom0, + /*strict_scope_placement=*/false, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - RegisterLoggingDevice(context.get(), custom1, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom1, + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -314,14 +320,34 @@ TEST(CUSTOM_DEVICE, InputBasedPlacement) { ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom1)); - // Custom device: mix of custom/physical fails. + // Custom device: mix of custom/physical places the op on the custom device. 
matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcpu.get())); num_retvals = 1; + executed = false; TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); - ASSERT_NE(TF_OK, TF_GetCode(status.get())); - ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); - ASSERT_TRUE( - absl::StrContains(TF_Message(status.get()), "[]")); // kVariantDeviceNull + EXPECT_TRUE(executed); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_DeleteTensorHandle(retval); + + // Explicit placement still forces the op onto the requested device + matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcpu.get())); + TFE_OpSetDevice(matmul.get(), "/job:localhost/replica:0/task:0/device:CPU:0", + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + num_retvals = 1; + executed = false; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + EXPECT_FALSE(executed); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); + + // Custom devices can refuse to do type-based dispatch (as hcustom1 is + // configured to do) + matmul.reset(MatMulOp(context.get(), hcustom1.get(), hcpu.get())); + num_retvals = 1; + executed = false; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + EXPECT_FALSE(executed); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); } TEST(CUSTOM_DEVICE, InvalidRegistrationError) { @@ -334,21 +360,24 @@ TEST(CUSTOM_DEVICE, InvalidRegistrationError) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); bool arrived = false; bool executed = false; - RegisterLoggingDevice(context.get(), "/device:CUSTOM:0", &arrived, &executed, + RegisterLoggingDevice(context.get(), "/device:CUSTOM:0", + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_INVALID_ARGUMENT) << TF_Message(status.get()); const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) - << TF_Message(status.get()); - - RegisterLoggingDevice(context.get(), - "/job:localhost/replica:0/task:0/device:CPU:0", + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) << TF_Message(status.get()); + + RegisterLoggingDevice( + context.get(), "/job:localhost/replica:0/task:0/device:CPU:0", + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) + << TF_Message(status.get()); } diff --git a/tensorflow/c/eager/custom_device_testutil.cc b/tensorflow/c/eager/custom_device_testutil.cc index 28de3665653..014abe38368 100644 --- a/tensorflow/c/eager/custom_device_testutil.cc +++ b/tensorflow/c/eager/custom_device_testutil.cc @@ -33,6 +33,9 @@ struct LoggingDevice { bool* arrived_flag; // Set to true whenever an operation is executed bool* executed_flag; + // If true, only explicit op placements are accepted. If false, uses + // type-based dispatch. 
+ bool strict_scope_placement; }; struct LoggedTensor { @@ -84,18 +87,35 @@ TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_Context* context, return nullptr; } -void LoggingDeviceExecute(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, +void LoggingDeviceExecute(const TFE_Op* original_op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info) { + const char* requested_placement = TFE_OpGetDevice(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + LoggingDevice* dev = reinterpret_cast(device_info); + if (dev->strict_scope_placement && *requested_placement == '\0') { + TF_SetStatus(s, TF_INTERNAL, + "Ops must be placed on the device explicitly, or their inputs " + "first copied to other devices."); + return; + } + TFE_Context* context = TFE_OpGetContext(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + const char* operation_name = TFE_OpGetName(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(original_op); + TFE_Op* op(TFE_NewOp(context, operation_name, s)); if (TF_GetCode(s) != TF_OK) return; TFE_OpAddAttrs(op, attributes); TFE_OpSetDevice(op, dev->underlying_device.c_str(), s); + if (TF_GetCode(s) != TF_OK) return; + int num_inputs = TFE_OpGetFlatInputCount(original_op, s); + if (TF_GetCode(s) != TF_OK) return; for (int j = 0; j < num_inputs; ++j) { - TFE_TensorHandle* input = inputs[j]; + TFE_TensorHandle* input = TFE_OpGetFlatInput(original_op, j, s); + if (TF_GetCode(s) != TF_OK) return; const char* input_device = TFE_TensorHandleDeviceName(input, s); if (TF_GetCode(s) != TF_OK) return; if (dev->device_name == input_device) { @@ -131,8 +151,8 @@ void DeleteLoggingDevice(void* device_info) { } // namespace void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag, bool* executed_flag, - TF_Status* status) { + bool strict_scope_placement, bool* arrived_flag, + bool* executed_flag, TF_Status* status) { TFE_CustomDevice custom_device; custom_device.copy_tensor_to_device = &CopyToLoggingDevice; custom_device.copy_tensor_from_device = &CopyTensorFromLoggingDevice; @@ -143,6 +163,7 @@ void RegisterLoggingDevice(TFE_Context* context, const char* name, device->executed_flag = executed_flag; device->device_name = name; device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + device->strict_scope_placement = strict_scope_placement; TFE_RegisterCustomDevice(context, custom_device, name, device, status); } @@ -168,5 +189,6 @@ void AllocateLoggingDevice(const char* name, bool* arrived_flag, logging_device->device_name = name; logging_device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + logging_device->strict_scope_placement = true; *device_info = reinterpret_cast(logging_device); } diff --git a/tensorflow/c/eager/custom_device_testutil.h b/tensorflow/c/eager/custom_device_testutil.h index 509df7d3e3e..a7c60080adf 100644 --- a/tensorflow/c/eager/custom_device_testutil.h +++ b/tensorflow/c/eager/custom_device_testutil.h @@ -25,8 +25,8 @@ limitations under the License. 
#include "tensorflow/c/tf_status.h" void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag, bool* executed_flag, - TF_Status* status); + bool strict_scope_placement, bool* arrived_flag, + bool* executed_flag, TF_Status* status); void AllocateLoggingDevice(const char* name, bool* arrived_flag, bool* executed_flag, TFE_CustomDevice** device, void** device_info); diff --git a/tensorflow/c/eager/dlpack.cc b/tensorflow/c/eager/dlpack.cc index 45048bd6efb..30d2009dc6a 100644 --- a/tensorflow/c/eager/dlpack.cc +++ b/tensorflow/c/eager/dlpack.cc @@ -109,7 +109,8 @@ DLDataType GetDlDataType(TF_DataType data_type, TF_Status* status) { // Gets DLPack's DLContext from eager tensor handle. DLContext GetDlContext(TFE_TensorHandle* h, TF_Status* status) { DLContext ctx; - const char* device_name = tensorflow::unwrap(h)->DeviceName(&status->status); + const char* device_name = + tensorflow::unwrap(h)->BackingDeviceName(&status->status); DeviceNameUtils::ParsedName parsed_name; tensorflow::DeviceNameUtils::ParseFullName(device_name, &parsed_name); std::string device_type = parsed_name.type; diff --git a/tensorflow/c/eager/gradient_checker.cc b/tensorflow/c/eager/gradient_checker.cc new file mode 100644 index 00000000000..640edc7228a --- /dev/null +++ b/tensorflow/c/eager/gradient_checker.cc @@ -0,0 +1,201 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradient_checker.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +using namespace std; + +// ================== Helper functions ================= + +// Fills data with values [start,end) with given step size. +void Range(vector* data, int start, int end, int step = 1) { + for (int i = start; i < end; i += step) { + (*data)[i] = i; + } +} + +// Returns AbstractTensorHandlePtr containing [0, ..., n-1]. +AbstractTensorHandlePtr GetRangeTensorHandleUtil(AbstractContext* ctx, int n) { + vector vals(n); + int64_t vals_shape[] = {n}; + Range(&vals, 0, n); + AbstractTensorHandlePtr r = + GetTensorHandleUtilInt(ctx, vals.data(), vals_shape, 1); + return r; +} + +// Fills out_dims with the dimensions of the given tensor. 
+void GetDims(const TF_Tensor* t, int64_t* out_dims) { + int num_dims = TF_NumDims(t); + for (int i = 0; i < num_dims; i++) { + out_dims[i] = TF_Dim(t, i); + } +} + +// Runs model as is if output is a scalar, +// else sums the output tensor before returning. +Status RunAndMaybeSum(AbstractContext* ctx, Model forward, + absl::Span inputs, + absl::Span outputs, + bool use_function) { + GradientRegistry registry; + std::vector model_outputs(1); + + // Run the model. + TF_RETURN_IF_ERROR(RunModel(forward, ctx, inputs, + absl::MakeSpan(model_outputs), use_function, + registry)); + AbstractTensorHandle* model_out = model_outputs[0]; + + TF_Tensor* model_out_tensor; + TF_RETURN_IF_ERROR(GetValue(model_out, &model_out_tensor)); + int num_dims_out = TF_NumDims(model_out_tensor); + + // If the output is a scalar, then return the scalar output + if (num_dims_out == 0) { + outputs[0] = model_out; + return Status::OK(); + } + + // Else, reduce sum the output to get a scalar + + // Will sum all dimensions, so get a Tensor containing [0,...,num_dims_out-1]. + AbstractTensorHandlePtr sum_dims = + GetRangeTensorHandleUtil(ctx, num_dims_out); + + // Reduce sum the output on all dimensions. + std::vector sum_inputs(2); + sum_inputs[0] = model_out; + sum_inputs[1] = sum_dims.get(); + + TF_RETURN_IF_ERROR(ops::Sum(ctx, absl::MakeSpan(sum_inputs), + absl::MakeSpan(model_outputs), "sum_output")); + outputs[0] = model_outputs[0]; + return Status::OK(); +} +// ========================= End Helper Functions============================== + +Status CalcNumericalGrad(AbstractContext* ctx, Model forward, + absl::Span inputs, + int input_index, bool use_function, + AbstractTensorHandle** numerical_grad) { + AbstractTensorHandle* theta = + inputs[input_index]; // parameter we are grad checking + + // Convert from AbstractTensor to TF_Tensor. + TF_Tensor* theta_tensor; + TF_RETURN_IF_ERROR(GetValue(theta, &theta_tensor)); + + // Get number of elements and fill data. + int num_elems = TF_TensorElementCount(theta_tensor); + vector theta_data(num_elems); + memcpy(theta_data.data(), TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + + // Initialize space for the numerical gradient. + vector dtheta_approx(num_elems); + + // Get theta shape and store in theta_dims. + int num_dims = TF_NumDims(theta_tensor); + vector theta_dims(num_dims); + GetDims(theta_tensor, theta_dims.data()); + + // Initialize auxilary data structures. + vector thetaPlus_data(num_elems); + vector thetaMinus_data(num_elems); + std::vector f_outputs(1); + + // Numerical Grad Check + for (int i = 0; i < num_elems; i++) { + // Get relative epsilon value + float epsilon = + std::abs(theta_data[i] * 1e-4 + 1e-4); // add 1e-4 to prevent div by 0 + AbstractTensorHandlePtr two_eps = + GetScalarTensorHandleUtil(ctx, 2 * epsilon); + + // Initialize theta[i] + epsilon. + memcpy(thetaPlus_data.data(), TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + thetaPlus_data[i] += epsilon; + AbstractTensorHandlePtr thetaPlus = GetTensorHandleUtilFloat( + ctx, thetaPlus_data.data(), theta_dims.data(), num_dims); + + // Initialize theta[i] - epsilon. 
+ memcpy(&thetaMinus_data[0], TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + thetaMinus_data[i] -= epsilon; + AbstractTensorHandlePtr thetaMinus = GetTensorHandleUtilFloat( + ctx, thetaMinus_data.data(), theta_dims.data(), num_dims); + + // Get f(theta + eps): + inputs[input_index] = thetaPlus.get(); + TF_RETURN_IF_ERROR(RunAndMaybeSum(ctx, forward, inputs, + absl::MakeSpan(f_outputs), use_function)); + AbstractTensorHandle* fPlus = f_outputs[0]; + + // Get f(theta - eps): + inputs[input_index] = thetaMinus.get(); + TF_RETURN_IF_ERROR(RunAndMaybeSum(ctx, forward, inputs, + absl::MakeSpan(f_outputs), use_function)); + AbstractTensorHandle* fMinus = f_outputs[0]; + + // Take Difference of both estimates: (f(theta + eps) - f(theta - eps)). + TF_RETURN_IF_ERROR( + ops::Sub(ctx, {fPlus, fMinus}, absl::MakeSpan(f_outputs), "sub_top")); + AbstractTensorHandle* fDiff = f_outputs[0]; + + // Calculate using the difference quotient definition: + // (f(theta + eps) - f(theta - eps)) / (2 * eps). + TF_RETURN_IF_ERROR(ops::DivNoNan(ctx, {fDiff, two_eps.get()}, + absl::MakeSpan(f_outputs), + "diff_quotient")); + AbstractTensorHandle* diff_quotient = f_outputs[0]; + + TF_Tensor* grad_tensor; + TF_RETURN_IF_ERROR(GetValue(diff_quotient, &grad_tensor)); + float grad_data[1]; + memcpy(&grad_data[0], TF_TensorData(grad_tensor), + TF_TensorByteSize(grad_tensor)); + + dtheta_approx[i] = grad_data[0]; + } + + // Populate *numerical_grad with the data from dtheta_approx. + TF_RETURN_IF_ERROR(TensorHandleWithDimsFloat( + ctx, dtheta_approx.data(), theta_dims.data(), num_dims, numerical_grad)); + return Status::OK(); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradient_checker.h b/tensorflow/c/eager/gradient_checker.h new file mode 100644 index 00000000000..8497f5af48e --- /dev/null +++ b/tensorflow/c/eager/gradient_checker.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +/* Returns numerical grad inside `dtheta_approx` given `forward` model and + * parameter specified by `input_index`. + * + * I.e. 
if y = and w = inputs[input_index], + * this will calculate dy/dw numerically. + * + * `use_function` indicates whether to use graph mode(true) or eager(false). + * + * `numerical_grad` is the pointer to the AbstractTensorHandle* which will + * hold the numerical gradient data at the end of the function. + */ +Status CalcNumericalGrad(AbstractContext* ctx, Model forward, + absl::Span inputs, + int input_index, bool use_function, + AbstractTensorHandle** numerical_grad); + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradient_checker_test.cc b/tensorflow/c/eager/gradient_checker_test.cc new file mode 100644 index 00000000000..7a438085fb5 --- /dev/null +++ b/tensorflow/c/eager/gradient_checker_test.cc @@ -0,0 +1,265 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradient_checker.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/eager/mnist_gradients_testutil.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace gradients { +namespace internal { +namespace { + +class GradientCheckerTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); + } +}; + +Status RegisterGradients(GradientRegistry* registry) { + TF_RETURN_IF_ERROR(registry->Register("MatMul", MatMulRegisterer)); + TF_RETURN_IF_ERROR( + registry->Register("SparseSoftmaxCrossEntropyWithLogits", + SparseSoftmaxCrossEntropyWithLogitsRegisterer)); + return Status::OK(); +} + +TEST_P(GradientCheckerTest, TestGradCheckMatMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + float B_vals[] = {.5f, -1.0f, 1.0f, 1.0f}; + int64_t 
B_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + AbstractTensorHandlePtr B = + GetTensorHandleUtilFloat(ctx.get(), B_vals, B_dims, num_dims); + + std::vector inputs; + inputs.push_back(A.get()); + inputs.push_back(B.get()); + + AbstractTensorHandle* grad_approx; + Status s = CalcNumericalGrad( + ctx.get(), MatMulModel, absl::MakeSpan(inputs), /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &grad_approx); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(grad_approx, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + float expected_dA[4] = {-.5f, 2.0f, -.5f, 2.0f}; + float tolerance = 1e-2; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(expected_dA[j], result_data[j], tolerance); + } + TF_DeleteTensor(gt); +} + +TEST_P(GradientCheckerTest, TestGradCheckMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = ScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + AbstractTensorHandlePtr y; + { + AbstractTensorHandle* y_raw = nullptr; + Status s = ScalarTensorHandle(ctx.get(), 7.0f, &y_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + y.reset(y_raw); + } + + // Will perform z = x*y. + // dz/dx = y + + std::vector inputs; + inputs.push_back(x.get()); + inputs.push_back(y.get()); + AbstractTensorHandle* g; + + Status s = CalcNumericalGrad(ctx.get(), MulModel, absl::MakeSpan(inputs), + /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &g); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(g, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float result_data[1] = {0}; + memcpy(&result_data[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + ASSERT_NEAR(result_data[0], 7.0f, /*abs_error=*/1e-2); + TF_DeleteTensor(gt); +} + +TEST_P(GradientCheckerTest, TestGradCheckSoftmax) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + /** Test to show how to use this API with analytical gradients: + * + * We have `SoftmaxLossGradModel`, which is a wrapper for the + * Softmax analytical gradient found in c/experimental/nn_grads. + * + * We will use the GradientChecker by applying finite differences + * to the forward pass wrapped in `SoftmaxModel` and verify that + * both the analytical and numerical gradients are relatively + * close. 
+ * + */ + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = scores + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, 1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // y = labels + int y_vals[] = {1, 0, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + std::vector inputs; + inputs.push_back(X.get()); + inputs.push_back(y.get()); + + // Run analytical gradient and get its data. + std::vector outputs(2); + s = RunModel(SoftmaxLossGradModel, ctx.get(), absl::MakeSpan(inputs), + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float danalytical[9] = {0}; // Contains data from analytical gradient. + memcpy(&danalytical[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + // Run numerical gradient approximation using the GradientChecker API. + AbstractTensorHandle* g; // Will contain numerical approximation data. + s = CalcNumericalGrad(ctx.get(), SoftmaxModel, absl::MakeSpan(inputs), + /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &g); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(g, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float dnumerical[9] = {0}; + memcpy(&dnumerical[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + // Now compare the two implementations: + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(dnumerical[j], danalytical[j], /*abs_error=*/1e-2); + } + + // Only Unref() first output as 2nd is nullptr grad for labels + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); + TF_DeleteTensor(gt); +} + +#ifdef PLATFORM_GOOGLE +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, GradientCheckerTest, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#else +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, GradientCheckerTest, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#endif +} // namespace +} // namespace internal +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradients.cc b/tensorflow/c/eager/gradients.cc index 406da1291ae..89ff140fa73 100644 --- a/tensorflow/c/eager/gradients.cc +++ b/tensorflow/c/eager/gradients.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/eager/gradients.h" #include "absl/strings/str_cat.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients_internal.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" @@ -23,25 +24,97 @@ limitations under the License. 
namespace tensorflow { namespace gradients { -Status GradientRegistry::Register(const string& op_name, - GradientFunctionFactory factory) { +namespace { +Status ZerosLike(AbstractContext* ctx, AbstractTensorHandle* t, + AbstractTensorHandle** result) { + AbstractOperationPtr op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(op->Reset("ZerosLike", /*raw_device_name=*/nullptr)); + if (isa(op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(op.get())->SetOpName( + absl::StrCat("ZerosLike", ToId(t)).c_str())); + } + TF_RETURN_IF_ERROR(op->AddInput(t)); + int num_outputs = 1; + std::vector outputs(num_outputs); + TF_RETURN_IF_ERROR( + op->Execute(absl::Span(outputs), &num_outputs)); + *result = outputs[0]; + return Status::OK(); +} +} // namespace + +class IncomingGradientsImpl : public IncomingGradients { + public: + explicit IncomingGradientsImpl( + absl::Span grad_inputs, Context* ctx, + DefaultGradientFunction* default_gradients) + : grad_inputs_(grad_inputs), + ctx_(ctx), + default_gradients_(default_gradients) {} + AbstractTensorHandle* operator[](int i) const override { + return default_gradients_->get(ctx_, grad_inputs_, i); + } + size_t size() const override { return grad_inputs_.size(); } + + private: + absl::Span grad_inputs_; + Context* ctx_; + DefaultGradientFunction* default_gradients_; +}; + +AllZerosDefaultGradients::AllZerosDefaultGradients(const ForwardOperation& op) + : outputs_(op.outputs) { + for (auto output : outputs_) { + output->Ref(); + } +} +AbstractTensorHandle* AllZerosDefaultGradients::get( + Context* ctx, absl::Span grad_inputs, int i) { + if (grad_inputs[i]) { + return grad_inputs[i]; + } + if (cached_default_grads_[i]) { + return cached_default_grads_[i].get(); + } + AbstractTensorHandle* result = nullptr; + Status s = ZerosLike(ctx->ctx, outputs_[i], &result); + if (!s.ok()) { + if (result) { + result->Unref(); + } + VLOG(1) << "Failed to create ZerosLike for index " << i; + return nullptr; + } + cached_default_grads_[i].reset(result); + return result; +} + +PassThroughDefaultGradients::PassThroughDefaultGradients( + const ForwardOperation& op) {} +AbstractTensorHandle* PassThroughDefaultGradients::get( + Context* ctx, absl::Span grad_inputs, int i) { + return grad_inputs[i]; +} + +Status GradientRegistry::Register( + const string& op_name, BackwardFunctionFactory backward_function_factory) { auto iter = registry_.find(op_name); if (iter != registry_.end()) { const string error_msg = "Gradient already exists for op: " + op_name + "."; return errors::AlreadyExists(error_msg); } - registry_.insert({op_name, factory}); + registry_.insert({op_name, backward_function_factory}); return Status::OK(); } Status GradientRegistry::Lookup( const ForwardOperation& op, - std::unique_ptr* grad_fn) const { + std::unique_ptr* backward_function) const { auto iter = registry_.find(op.op_name); if (iter == registry_.end()) { const string error_msg = "No gradient defined for op: " + op.op_name + "."; return errors::NotFound(error_msg); } - grad_fn->reset(iter->second(op)); + backward_function->reset(iter->second(op)); return Status::OK(); } @@ -92,33 +165,8 @@ AbstractTensorHandle* TapeTensor::OnesLike() const { } return outputs[0]; } -AbstractTensorHandle* TapeTensor::ZerosLike() const { - AbstractOperationPtr op(ctx_->CreateOperation()); - // TODO(srbs): Consider adding a TF_RETURN_NULLPTR_IF_ERROR. 
- Status s = op->Reset("ZerosLike", /*raw_device_name=*/nullptr); - if (!s.ok()) { - return nullptr; - } - if (isa(op.get())) { - s = dyn_cast(op.get())->SetOpName( - absl::StrCat("ZerosLike", ToId(handle_)).c_str()); - if (!s.ok()) { - return nullptr; - } - } - s = op->AddInput(handle_); - if (!s.ok()) { - return nullptr; - } - int num_outputs = 1; - // TODO(srbs): Figure out who is in charge of releasing this. - std::vector outputs(num_outputs); - s = op->Execute(absl::Span(outputs), &num_outputs); - if (!s.ok()) { - return nullptr; - } - return outputs[0]; -} + +AbstractTensorHandle* TapeTensor::ZerosLike() const { return nullptr; } // Returns the number of elements in the gradient tensor. int64 TapeVSpace::NumElements(AbstractTensorHandle* tensor) const { @@ -159,13 +207,16 @@ AbstractTensorHandle* TapeVSpace::AggregateGradients( // Calls the passed-in backward function. Status TapeVSpace::CallBackwardFunction( - GradientFunction* backward_function, + BackwardFunction* backward_function, const std::vector& unneeded_gradients, gtl::ArraySlice output_gradients, std::vector* result) const { if (backward_function == nullptr) return Status::OK(); Context ctx = {ctx_}; - return backward_function->Compute(&ctx, output_gradients, result); + IncomingGradientsImpl incoming_gradients( + output_gradients, &ctx, backward_function->GetDefaultGradientFunction()); + return backward_function->GetGradientFunction()->Compute( + &ctx, incoming_gradients, result); } // Looks up the ID of a Gradient. @@ -191,6 +242,7 @@ namespace internal { Status Reset(AbstractOperation* op_, const char* op, const char* raw_device_name, ForwardOperation* forward_op_) { forward_op_->op_name = op; + forward_op_->attrs.Reset(op); return op_->Reset(op, raw_device_name); } Status AddInput(AbstractOperation* op_, AbstractTensorHandle* input, @@ -363,21 +415,30 @@ Status Execute(AbstractOperation* op_, AbstractContext* ctx, input_ids[i] = ToId(forward_op_->inputs[i]); input_dtypes[i] = forward_op_->inputs[i]->DataType(); } + for (int i = 0; i < *num_retvals; i++) { + // TODO(srbs): Manage refcount of ForwardOperation's inputs/outputs. + forward_op_->outputs.push_back(retvals[i]); + } + // TODO(b/166669239): This is needed to support AttrBuilder::Get for string + // attributes. Number type attrs and DataType attrs work fine without this. + // Consider getting rid of this and making the behavior between number types + // and string consistent. 
+ forward_op_->attrs.BuildNodeDef(); std::vector tape_tensors; for (auto t : retvals) { tape_tensors.push_back(TapeTensor(t, ctx)); } tape->RecordOperation( op_->Name(), tape_tensors, input_ids, input_dtypes, - [registry, forward_op_]() -> GradientFunction* { - std::unique_ptr grad_fn; - Status s = registry.Lookup(*forward_op_, &grad_fn); + [registry, forward_op_]() -> BackwardFunction* { + std::unique_ptr backward_fn; + Status s = registry.Lookup(*forward_op_, &backward_fn); if (!s.ok()) { return nullptr; } - return grad_fn.release(); + return backward_fn.release(); }, - [](GradientFunction* ptr) { + [](BackwardFunction* ptr) { if (ptr) { delete ptr; } diff --git a/tensorflow/c/eager/gradients.h b/tensorflow/c/eager/gradients.h index 267ee5b7ab2..04e11291404 100644 --- a/tensorflow/c/eager/gradients.h +++ b/tensorflow/c/eager/gradients.h @@ -55,18 +55,25 @@ struct Context { public: AbstractContext* ctx; }; + +class IncomingGradients { + public: + virtual AbstractTensorHandle* operator[](int i) const = 0; + virtual size_t size() const = 0; + virtual ~IncomingGradients() {} +}; + class GradientFunction { public: // TODO(srbs): How we support CompositeTensors e.g. IndexedSlices in // `grad_inputs`. - virtual Status Compute(Context* ctx, - absl::Span grad_inputs, + virtual Status Compute(Context* ctx, const IncomingGradients& grad_inputs, std::vector* grad_outputs) = 0; virtual ~GradientFunction() {} }; // Metadata from the forward operation that is made available to the -// gradient registerer to instantiate a GradientFunction. +// gradient registerer to instantiate a BackwardFunction. struct ForwardOperation { public: string op_name; @@ -76,18 +83,86 @@ struct ForwardOperation { AbstractContext* ctx; }; -using GradientFunctionFactory = - std::function; - -// Map from op name to a `GradientFunctionFactory`. -class GradientRegistry { +// Interface for building default zeros gradients for op outputs which are +// missing incoming gradients. Custom implementations of this can be used to +// control which of the forward op's output tensors/their metadata needs to +// be kept around in memory to build the default zeros grad. +// +// Some common helper implementations are provided below. +class DefaultGradientFunction { public: - Status Register(const string& op, GradientFunctionFactory factory); - Status Lookup(const ForwardOperation& op, - std::unique_ptr* grad_fn) const; + virtual AbstractTensorHandle* get( + Context* ctx, absl::Span grad_inputs, + int i) = 0; + virtual ~DefaultGradientFunction() {} +}; + +// Returns zeros for any `nullptr` in `grad_inputs`. +// +// This may require keeping track of all of forward op's output +// tensors and hence may incur a higher memory footprint. Use sparingly. +// +// Multiple calls to `AllZerosDefaultGradients::get` return the same tensor +// handle. +// +// The destructor of this class `Unref`'s any cached tensor handles so users of +// those tensor handles should `Ref` them in order to keep them alive if needed. +class AllZerosDefaultGradients : public DefaultGradientFunction { + public: + explicit AllZerosDefaultGradients(const ForwardOperation& op); + AbstractTensorHandle* get(Context* ctx, + absl::Span grad_inputs, + int i) override; private: - absl::flat_hash_map registry_; + // TODO(srbs): We do not always need to keep the tensors around. In immediate + // execution mode we just need to store the shape and dtype. During tracing + // we may need to keep the tensor around if the shape is not full defined. 
+ std::vector outputs_; + std::vector cached_default_grads_; +}; + +// Passes through `grad_inputs` as-is. The `GradientFunction` +// will be expected to deal with nullptr in `grad_inputs` if any. +class PassThroughDefaultGradients : public DefaultGradientFunction { + public: + explicit PassThroughDefaultGradients(const ForwardOperation& op); + AbstractTensorHandle* get(Context* ctx, + absl::Span grad_inputs, + int i) override; +}; + +// A `BackwardFunction` wraps a `GradientFunction` and a +// `DefaultGradientFunction`. Both are owned by this class' instance. +class BackwardFunction { + public: + BackwardFunction(GradientFunction* gradient_function, + DefaultGradientFunction* default_gradients) + : gradient_function_(gradient_function), + default_gradients_(default_gradients) {} + GradientFunction* GetGradientFunction() { return gradient_function_.get(); } + DefaultGradientFunction* GetDefaultGradientFunction() { + return default_gradients_.get(); + } + + private: + std::unique_ptr gradient_function_; + std::unique_ptr default_gradients_; +}; + +using BackwardFunctionFactory = + std::function; + +// Map from op name to a `BackwardFunctionFactory`. +class GradientRegistry { + public: + Status Register(const string& op, + BackwardFunctionFactory backward_function_factory); + Status Lookup(const ForwardOperation& op, + std::unique_ptr* backward_function) const; + + private: + absl::flat_hash_map registry_; }; // Returns a unique id for the tensor which is used by the tape to build @@ -106,9 +181,16 @@ int64 ToId(AbstractTensorHandle* t); // allow us to trace the data dependencies between operations and hence compute // gradients. // -// This also implements `ZerosLike` and `OnesLike` to create the default +// This also implements `OnesLike` to create the default // incoming gradients for tensors which do not already have an incoming // gradient. +// +// `ZerosLike` is not expected to be called and returns a nullptr. The creation +// of default zeros grads is handled by the `DefaultGradientFunction` registered +// for each op. +// TODO(srbs): We need to define `ZerosLike` here to keep the compiler happy. +// Figure out a way to avoid this. +// TODO(srbs): Should ZerosLike check-fail instead of returning nullptr? class TapeTensor { public: TapeTensor(AbstractTensorHandle* handle, AbstractContext* ctx); @@ -123,7 +205,7 @@ class TapeTensor { private: AbstractTensorHandle* handle_; - // The context where OnesLike and ZerosLike ops are to be created. + // The context where OnesLike ops are to be created. AbstractContext* ctx_; }; @@ -132,7 +214,7 @@ class TapeTensor { // gradient and for performing gradient aggregation. // See `tensorflow::eager::VSpace` for more details. class TapeVSpace - : public eager::VSpace { + : public eager::VSpace { public: explicit TapeVSpace(AbstractContext* ctx) : ctx_(ctx) {} ~TapeVSpace() override {} @@ -147,7 +229,7 @@ class TapeVSpace // Calls the passed-in backward function. Status CallBackwardFunction( - GradientFunction* backward_function, + BackwardFunction* backward_function, const std::vector& unneeded_gradients, gtl::ArraySlice output_gradients, std::vector* result) const override; @@ -168,8 +250,14 @@ class TapeVSpace }; // A tracing/immediate-execution agnostic tape. +// +// Gradient functions defined for this library support handling null incoming +// gradients. `Tape::ComputeGradient` should be called with +// `build_default_zeros_grads=false`. 
Calling with +// `build_default_zeros_grads=true` (the default) is equivalent but just results +// in extra work because `TapeTensor::ZerosLike` returns a `nullptr` anyway. using Tape = tensorflow::eager::GradientTape; + BackwardFunction, TapeTensor>; } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index e02f189c3d2..3aedf55e97a 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include +#include "absl/container/flat_hash_set.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_experimental.h" @@ -23,6 +24,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/gradients/array_grad.h" #include "tensorflow/c/experimental/gradients/math_grad.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/tf_status_helper.h" @@ -35,17 +37,26 @@ namespace tensorflow { namespace gradients { namespace internal { namespace { +using std::vector; +using tensorflow::TF_StatusPtr; +using tracing::TracingOperation; class CppGradients : public ::testing::TestWithParam> { protected: void SetUp() override { - TF_SetTracingImplementation(std::get<0>(GetParam())); + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); } }; Status RegisterGradients(GradientRegistry* registry) { - return registry->Register("Add", AddRegisterer); + TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("IdentityN", IdentityNRegisterer)); + return Status::OK(); } // Computes `inputs[0] + inputs[1]` and records it on the tape. @@ -58,9 +69,9 @@ Status Add(AbstractContext* ctx, Tape* tape, forward_op.ctx = ctx; TF_RETURN_IF_ERROR( Reset(add_op.get(), "Add", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(add_op.get())) { + if (isa(add_op.get())) { TF_RETURN_IF_ERROR( - dyn_cast(add_op.get())->SetOpName("my_add")); + dyn_cast(add_op.get())->SetOpName("my_add")); } TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[0], &forward_op)); TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[1], &forward_op)); @@ -69,6 +80,46 @@ Status Add(AbstractContext* ctx, Tape* tape, registry); } +// Computes `exp(inputs[0])` and records it on the tape. +Status Exp(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr exp_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(exp_op.get(), "Exp", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(exp_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(exp_op.get())->SetOpName("my_exp")); + } + TF_RETURN_IF_ERROR(AddInput(exp_op.get(), inputs[0], &forward_op)); + int num_retvals = 1; + return Execute(exp_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `IdentityN(inputs)` and records it on the tape. 
+Status IdentityN(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr identity_n_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(identity_n_op.get(), "IdentityN", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(identity_n_op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(identity_n_op.get()) + ->SetOpName("my_identity_n")); + } + TF_RETURN_IF_ERROR(AddInputList(identity_n_op.get(), inputs, &forward_op)); + int num_retvals = outputs.size(); + return Execute(identity_n_op.get(), ctx, outputs, &num_retvals, &forward_op, + tape, registry); +} + // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -91,7 +142,8 @@ Status AddGradModel(AbstractContext* ctx, vspace, /*target_tensor_ids=*/{ToId(add_outputs[0])}, /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, source_tensors_that_are_targets, - /*output_gradients=*/{}, &out_grads)); + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); for (auto add_output : add_outputs) { add_output->Unref(); } @@ -101,6 +153,71 @@ Status AddGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// y = exp(inputs[0]) +// return grad(y, {inputs[0]}) +Status ExpGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + std::vector exp_outputs(1); + TF_RETURN_IF_ERROR(Exp(ctx, tape, inputs, absl::MakeSpan(exp_outputs), + registry)); // Compute x+y. + std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(exp_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto exp_output : exp_outputs) { + exp_output->Unref(); + } + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + +// Computes +// ignored, y = IdentityN(inputs[0], inputs[1]) +// return grad(y, {inputs[0], inputs[1]}) +// This should return [nullptr, 1]. 
+Status IdentityNGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); + tape->Watch(ToId(inputs[1])); + + vector identity_n_outputs(2); + TF_RETURN_IF_ERROR(IdentityN(ctx, tape, inputs, + absl::MakeSpan(identity_n_outputs), registry)); + + std::unordered_map + source_tensors_that_are_targets; + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(identity_n_outputs[1])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto identity_n_output : identity_n_outputs) { + identity_n_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + AbstractContext* BuildFunction(const char* fn_name) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -132,26 +249,42 @@ Status RunModel(Model model, AbstractContext* ctx, if (use_function) { const char* fn_name = "test_fn"; std::unique_ptr scoped_func; + // Returning null tensors from a tf.function is not supported, so we keep + // track of indices in the model's outputs are nullptr in this set. + // The FunctionDef only outputs the non-null tensors. We later pad the + // function op outputs to have nullptrs at the `null_indices`. + absl::flat_hash_set null_indices; { AbstractContextPtr func_ctx(BuildFunction(fn_name)); std::vector func_inputs; func_inputs.reserve(inputs.size()); TF_RETURN_IF_ERROR( CreateParamsForInputs(func_ctx.get(), inputs, &func_inputs)); - OutputList output_list; - output_list.expected_num_outputs = outputs.size(); - output_list.outputs.resize(outputs.size()); + vector model_outputs; + model_outputs.resize(outputs.size()); TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), - absl::MakeSpan(output_list.outputs), registry)); + absl::MakeSpan(model_outputs), registry)); for (auto func_input : func_inputs) { func_input->Unref(); } AbstractFunction* func = nullptr; + OutputList output_list; + output_list.expected_num_outputs = 0; + output_list.outputs.reserve(outputs.size()); + for (int i = 0; i < model_outputs.size(); i++) { + if (model_outputs[i]) { + output_list.outputs.emplace_back(model_outputs[i]); + output_list.expected_num_outputs += 1; + } else { + null_indices.insert(i); + } + } TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) ->Finalize(&output_list, &func)); scoped_func.reset(func); - output_list.outputs[0]->Unref(); - output_list.outputs[1]->Unref(); + for (auto output : output_list.outputs) { + output->Unref(); + } TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); } @@ -160,8 +293,19 @@ Status RunModel(Model model, AbstractContext* ctx, for (auto input : inputs) { TF_RETURN_IF_ERROR(fn_op->AddInput(input)); } - int retvals = outputs.size(); - TF_RETURN_IF_ERROR(fn_op->Execute(outputs, &retvals)); + int retvals = outputs.size() - null_indices.size(); + vector fn_outputs(retvals); + TF_RETURN_IF_ERROR(fn_op->Execute( + absl::Span(fn_outputs.data(), fn_outputs.size()), + &retvals)); + int skipped_indices = 0; + for (int i = 0; i < outputs.size(); i++) { + if (!null_indices.contains(i)) { + outputs[i] = fn_outputs[i - skipped_indices]; + } else { + skipped_indices += 1; + } + } TF_RETURN_IF_ERROR(ctx->RemoveFunction(fn_name)); return Status::OK(); } else { @@ -264,18 +408,172 @@ TEST_P(CppGradients, 
TestAddGrad) { TF_DeleteTensor(result_tensor); } -// TODO(b/160888630): Enable this test with mlir after AddInputList is -// supported. It is needed for AddN op which is used for gradient aggregation. +TEST_P(CppGradients, TestExpGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = exp(x) + // outputs = tape.gradient(y, x) + std::vector outputs(1); + s = RunModel(ExpGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_NEAR(*result_value, 2.718, 0.001); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + +TEST_P(CppGradients, TestIdentityNGrad) { + // Pseudo-code: + // + // tape.watch(x1) + // tape.watch(x2) + // unused, y = IdentityN([x1, x2]) + // outputs = tape.gradient(y, [x1, x2]) + // Expected: [nullptr, 1] + // + // This test is interesting because the current implementation of GradientTape + // would return [0, 1] whereas we use build_default_zeros_grads=false here + // so we get back [nullptr, 1]. 
+ std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x1; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x1.reset(x_raw); + } + AbstractTensorHandlePtr x2; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x2.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + std::vector outputs(2); + s = RunModel(IdentityNGradModel, ctx.get(), {x1.get(), x2.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + EXPECT_EQ(outputs[0], nullptr); + TF_Tensor* result_tensor; + s = getValue(outputs[1], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 1.0); + outputs[1]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + +TEST_P(CppGradients, TestSetAttrString) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr t; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + t.reset(x_raw); + } + + AbstractOperationPtr check_numerics_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx.get(); + Status s = Reset(check_numerics_op.get(), "CheckNumerics", + /*raw_device_name=*/nullptr, &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + if (isa(check_numerics_op.get())) { + s = dyn_cast(check_numerics_op.get()) + ->SetOpName("check_numerics"); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + } + s = AddInput(check_numerics_op.get(), t.get(), &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + string message = "This is the way!"; + s = SetAttrString(check_numerics_op.get(), "message", message.data(), + message.length(), &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + int num_retvals = 1; + std::vector outputs(1); + GradientRegistry registry; + std::unique_ptr tape(new Tape(/*persistent=*/false)); + s = Execute(check_numerics_op.get(), ctx.get(), absl::MakeSpan(outputs), + &num_retvals, &forward_op, tape.get(), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + string read_message; + s = forward_op.attrs.Get("message", &read_message); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(read_message, message); +} + +// TODO(b/164171226): Enable this test with tfrt after AddInputList is +// supported. It is needed for IdentityN. 
#ifdef PLATFORM_GOOGLE INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, - ::testing::Combine(::testing::Values("graphdef"), - /*tfrt*/ ::testing::Values(true, false), + ::testing::Combine(::testing::Values("graphdef", "mlir"), + /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); #else INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, - ::testing::Combine(::testing::Values("graphdef"), + ::testing::Combine(::testing::Values("graphdef", "mlir"), /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); #endif diff --git a/tensorflow/c/eager/gradients_util.cc b/tensorflow/c/eager/gradients_util.cc new file mode 100644 index 00000000000..e53faf4a3f3 --- /dev/null +++ b/tensorflow/c/eager/gradients_util.cc @@ -0,0 +1,317 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradients_util.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +using namespace std; + +Status ScalarTensorHandleHelper(TFE_Context* ctx, float value, + TFE_TensorHandle** result) { + float data[] = {value}; + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, nullptr, 0, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +Status TensorHandleWithDimsFloatHelper(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims, + TFE_TensorHandle** result) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, &dims[0], num_dims, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +Status TensorHandleWithDimsIntHelper(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims, + TFE_TensorHandle** result) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + 
TFE_AllocateHostTensor(ctx, TF_INT32, &dims[0], num_dims, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +// Get a scalar TensorHandle with given value +Status ScalarTensorHandle(AbstractContext* ctx, float value, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(ScalarTensorHandleHelper(eager_ctx, value, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +// Get a TensorHandle with given float values and dimensions +Status TensorHandleWithDimsFloat(AbstractContext* ctx, float data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(TensorHandleWithDimsFloatHelper(eager_ctx, data, dims, + num_dims, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +// Get a TensorHandle with given int values and dimensions +Status TensorHandleWithDimsInt(AbstractContext* ctx, int data[], int64_t dims[], + int num_dims, AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(TensorHandleWithDimsIntHelper(eager_ctx, data, dims, + num_dims, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(wrap(t), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + *result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + return StatusFromTF_Status(status.get()); +} + +AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, + float vals[], int64_t dims[], + int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TensorHandleWithDimsFloat(ctx, vals, dims, num_dims, &a_raw); + if (s.ok()) { + A.reset(a_raw); + } + return A; +} + +AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], + int64_t dims[], int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TensorHandleWithDimsInt(ctx, vals, dims, num_dims, &a_raw); + if (s.ok()) { + A.reset(a_raw); + } + return A; +} + +AbstractTensorHandlePtr GetScalarTensorHandleUtil(AbstractContext* ctx, + float val) { + AbstractTensorHandlePtr y; + AbstractTensorHandle* y_raw = nullptr; + Status s = ScalarTensorHandle(ctx, val, &y_raw); + if (s.ok()) { + y.reset(y_raw); + } + return y; +} + 
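// [Editorial sketch, not part of the patch] A minimal example of how the
// helpers above compose, assuming an AbstractContext* obtained from
// BuildImmediateExecutionContext (defined later in this file); the name
// `RoundTripExample` is hypothetical and used only for illustration.
Status RoundTripExample(AbstractContext* ctx) {
  float vals[] = {1.0f, 2.0f, 3.0f, 4.0f};
  int64_t dims[] = {2, 2};
  // Wrap host data in an abstract tensor handle.
  AbstractTensorHandlePtr a =
      GetTensorHandleUtilFloat(ctx, vals, dims, /*num_dims=*/2);
  if (a == nullptr) return errors::Internal("Could not create handle.");
  // Resolve the handle back to a host TF_Tensor and copy out the data.
  TF_Tensor* t = nullptr;
  TF_RETURN_IF_ERROR(GetValue(a.get(), &t));
  float host[4] = {0};
  memcpy(host, TF_TensorData(t), TF_TensorByteSize(t));  // host mirrors vals
  TF_DeleteTensor(t);
  return Status::OK();
}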
+Status UpdateWeights(AbstractContext* ctx, vector& grads, + vector& weights, + AbstractTensorHandle* learning_rate) { + /* Update weights one by one using gradient update rule: + * + * w -= lr*grad[w] + * + * NOTE: assuming learning rate is positive + */ + + int num_grads = grads.size(); + vector temp_outputs(1); + std::string update_str; + + // Negate learning rate for gradient descent + TF_RETURN_IF_ERROR(ops::Neg(ctx, {learning_rate}, + absl::MakeSpan(temp_outputs), + "neg_lr")); // Compute -lr + learning_rate = temp_outputs[0]; + + for (int i = 0; i < num_grads; i++) { + // Compute dW = -lr * grad(w[i]) + update_str = "update_mul_" + std::to_string(i); + TF_RETURN_IF_ERROR(ops::Mul(ctx, {learning_rate, grads[i]}, + absl::MakeSpan(temp_outputs), + update_str.c_str())); + + AbstractTensorHandle* dW = temp_outputs[0]; + + // Compute temp = weights[i] + dW + update_str = "update_add_" + std::to_string(i); + TF_RETURN_IF_ERROR(ops::Add(ctx, {weights[i], dW}, + absl::MakeSpan(temp_outputs), + update_str.c_str())); + + // Update the weights + weights[i] = temp_outputs[0]; + } + + return Status::OK(); +} + +AbstractContext* BuildFunction(const char* fn_name) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name, status.get()); + return unwrap(graph_ctx); +} + +Status CreateParamsForInputs(AbstractContext* ctx, + absl::Span inputs, + vector* params) { + tracing::TracingTensorHandle* handle = nullptr; + for (auto input : inputs) { + TF_RETURN_IF_ERROR(dyn_cast(ctx)->AddParameter( + input->DataType(), &handle)); + params->emplace_back(handle); + } + return Status::OK(); +} + +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry) { + if (use_function) { + const char* fn_name = "test_fn"; + std::unique_ptr scoped_func; + // Returning null tensors from a tf.function is not supported, so we keep + // track of indices in the model's outputs are nullptr in this set. + // The FunctionDef only outputs the non-null tensors. We later pad the + // function op outputs to have nullptrs at the `null_indices`. 
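    // [Editor's note] A concrete trace of the padding described above, for a
    // model whose traced outputs are {nullptr, g1}: only g1 is recorded as a
    // FunctionDef output, null_indices becomes {0}, Execute fills
    // fn_outputs = {g1'}, and the loop after Execute copies g1' into
    // outputs[1] while leaving outputs[0] untouched (callers pre-initialize
    // their output vectors, so untouched entries stay nullptr).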
+ absl::flat_hash_set null_indices; + { + AbstractContextPtr func_ctx(BuildFunction(fn_name)); + vector func_inputs; + func_inputs.reserve(inputs.size()); + TF_RETURN_IF_ERROR( + CreateParamsForInputs(func_ctx.get(), inputs, &func_inputs)); + vector model_outputs; + model_outputs.resize(outputs.size()); + TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), + absl::MakeSpan(model_outputs), registry)); + for (auto func_input : func_inputs) { + func_input->Unref(); + } + AbstractFunction* func = nullptr; + OutputList output_list; + output_list.expected_num_outputs = 0; + output_list.outputs.reserve(outputs.size()); + for (int i = 0; i < model_outputs.size(); i++) { + if (model_outputs[i]) { + output_list.outputs.emplace_back(model_outputs[i]); + output_list.expected_num_outputs += 1; + } else { + null_indices.insert(i); + } + } + TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) + ->Finalize(&output_list, &func)); + scoped_func.reset(func); + for (auto output : output_list.outputs) { + output->Unref(); + } + TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); + } + + AbstractOperationPtr fn_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(fn_op->Reset(fn_name, /*raw_device_name=*/nullptr)); + for (auto input : inputs) { + TF_RETURN_IF_ERROR(fn_op->AddInput(input)); + } + int retvals = outputs.size() - null_indices.size(); + vector fn_outputs(retvals); + TF_RETURN_IF_ERROR(fn_op->Execute( + absl::Span(fn_outputs.data(), fn_outputs.size()), + &retvals)); + int skipped_indices = 0; + for (int i = 0; i < outputs.size(); i++) { + if (!null_indices.contains(i)) { + outputs[i] = fn_outputs[i - skipped_indices]; + } else { + skipped_indices += 1; + } + } + TF_RETURN_IF_ERROR(ctx->RemoveFunction(fn_name)); + return Status::OK(); + } else { + return model(ctx, inputs, outputs, registry); + } +} + +Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + *ctx = unwrap(TF_NewEagerExecutionContext(opts, status.get())); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_DeleteContextOptions(opts); + return Status::OK(); +} + +} // namespace gradients +} // namespace tensorflow \ No newline at end of file diff --git a/tensorflow/c/eager/gradients_util.h b/tensorflow/c/eager/gradients_util.h new file mode 100644 index 00000000000..cd0bbc0720d --- /dev/null +++ b/tensorflow/c/eager/gradients_util.h @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gradients { + +// Get a scalar TensorHandle with given value +Status ScalarTensorHandle(AbstractContext* ctx, float value, + AbstractTensorHandle** tensor); + +// Get a TensorHandle with given float values and dimensions +Status TensorHandleWithDimsFloat(AbstractContext* ctx, float data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor); + +// Get a TensorHandle with given int values and dimensions +Status TensorHandleWithDimsInt(AbstractContext* ctx, int data[], int64_t dims[], + int num_dims, AbstractTensorHandle** tensor); + +// Places data from `t` into *result_tensor. +Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor); + +// Util function that wraps an AbstractTensorHandle* with given data and dims. +AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, + float vals[], int64_t dims[], + int num_dims); + +// Util function that wraps an AbstractTensorHandle* with given data and dims. +AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], + int64_t dims[], int num_dims); + +// Util function that wraps an AbstractTensorHandle* with given data. +AbstractTensorHandlePtr GetScalarTensorHandleUtil(AbstractContext* ctx, + float val); + +// Performs gradient update for each weight using given learning rate. +Status UpdateWeights(AbstractContext* ctx, + std::vector& grads, + std::vector& weights, + AbstractTensorHandle* learning_rate); + +using Model = std::function, + absl::Span, const GradientRegistry&)>; + +// Runs given model in either graph or eager mode depending on value of +// use_function. +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry); + +// Builds context and returns inside *ctx. +Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx); + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/immediate_execution_context.h b/tensorflow/c/eager/immediate_execution_context.h index 6d06d9a8de6..02a3320ef65 100644 --- a/tensorflow/c/eager/immediate_execution_context.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -57,15 +57,10 @@ class ImmediateExecutionContext : public AbstractContext { // Create a tensor instance from the given data buffer and description. // `memory_releaser` will be called on destruction, and it's responsible for - // cleaning up the underlying buffer. `convert_string` indicates whether it - // has to handle tstring conversion. Expected to be removed once tstring - // migration is done. 
- virtual AbstractTensorInterface* CreateTensor(DataType dtype, - const int64_t* dims, - int num_dims, void* data, - size_t len, bool convert_string, - MemoryReleaser memory_releaser, - void* memory_releaser_arg) = 0; + // cleaning up the underlying buffer. + virtual AbstractTensorInterface* CreateTensor( + DataType dtype, const int64_t* dims, int num_dims, void* data, size_t len, + MemoryReleaser memory_releaser, void* memory_releaser_arg) = 0; // Create a handle to wrap and manage a Tensor virtual ImmediateExecutionTensorHandle* CreateLocalHandle( diff --git a/tensorflow/c/eager/immediate_execution_operation.h b/tensorflow/c/eager/immediate_execution_operation.h index ee212b21a96..7b68ec2c9f4 100644 --- a/tensorflow/c/eager/immediate_execution_operation.h +++ b/tensorflow/c/eager/immediate_execution_operation.h @@ -47,9 +47,6 @@ class ImmediateExecutionOperation : public AbstractOperation { virtual Status InputLength(const char* input_name, int* length) = 0; virtual Status OutputLength(const char* output_name, int* length) = 0; - // Experimental - virtual Status SetUseXla(bool enable) = 0; - // Set stack trace to be used for potential async error reporting. virtual void SetStackTrace(AbstractStackTrace stack_trace) = 0; diff --git a/tensorflow/c/eager/mnist_gradients_test.cc b/tensorflow/c/eager/mnist_gradients_test.cc new file mode 100644 index 00000000000..4114f50a798 --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_test.cc @@ -0,0 +1,723 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/eager/mnist_gradients_testutil.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace gradients { +namespace internal { +namespace { +using tensorflow::TF_StatusPtr; + +class CppGradients + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); + } +}; + +Status RegisterGradients(GradientRegistry* registry) { + TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("MatMul", MatMulRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Relu", ReluRegisterer)); + TF_RETURN_IF_ERROR( + registry->Register("SparseSoftmaxCrossEntropyWithLogits", + SparseSoftmaxCrossEntropyWithLogitsRegisterer)); + return Status::OK(); +} + +TEST_P(CppGradients, TestMatMulGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + float B_vals[] = {.5f, -1.0f, 1.0f, 1.0f}; + int64_t B_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + AbstractTensorHandlePtr B = + GetTensorHandleUtilFloat(ctx.get(), B_vals, B_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(A) + * tape.watch(B) + * Y = AB + * outputs = tape.gradient(Y, [A, B]) + */ + + std::vector outputs(2); + s = RunModel(MatMulGradModel, ctx.get(), {A.get(), B.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dA_tensor; + s = GetValue(outputs[0], &dA_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dA_tensor), + TF_TensorByteSize(dA_tensor)); + + float expected_dA[4] = {-.5f, 2.0f, -.5f, 2.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dA[j], tolerance); + } + + TF_Tensor* dB_tensor; + s = GetValue(outputs[1], &dB_tensor); + ASSERT_EQ(errors::OK, 
s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(dB_tensor), + TF_TensorByteSize(dB_tensor)); + + float expected_dB[4] = {4.0f, 4.0f, 6.0f, 6.0f}; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dB[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(dA_tensor); + TF_DeleteTensor(dB_tensor); +} + +TEST_P(CppGradients, TestMNISTForward) { + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t dims[] = {2, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t dims_y[] = {2}; + num_dims = sizeof(dims_y) / sizeof(dims_y[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, dims, num_dims); + + GradientRegistry registry; + + // Run the Forward Pass + std::vector outputs(2); + Status s = + RunModel(MNISTForwardModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[4] = {3.6f, -6.0f, 10.2f, -17.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } + + TF_Tensor* loss_vals_tensor; + s = GetValue(outputs[1], &loss_vals_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(loss_vals_tensor), + TF_TensorByteSize(loss_vals_tensor)); + float expected_losses[2] = {9.6f, 27.2f}; + for (int j = 0; j < 2; j++) { + ASSERT_NEAR(result_data[j], expected_losses[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(scores_tensor); + TF_DeleteTensor(loss_vals_tensor); +} + +TEST_P(CppGradients, TestMNISTForward2) { + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t X_dims[] = {3, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + 
int y_vals[] = {1, 1, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + + // Run the Forward Pass + std::vector outputs(2); + Status s = + RunModel(MNISTForwardModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[6] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[6] = {3.6f, -6.0f, 10.2f, -17.0f, 16.8f, -28.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 6; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } + + TF_Tensor* loss_vals_tensor; + s = GetValue(outputs[1], &loss_vals_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(loss_vals_tensor), + TF_TensorByteSize(loss_vals_tensor)); + float expected_losses[3] = {9.6f, 27.2f, 44.8f}; + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(result_data[j], expected_losses[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(scores_tensor); + TF_DeleteTensor(loss_vals_tensor); +} + +TEST_P(CppGradients, TestMatMulTranspose) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t X_dims[] = {2, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + GradientRegistry registry; + + // Run the MatMul Op + std::vector outputs(1); + + Status s = RunModel(MatMulTransposeModel, ctx.get(), {X.get(), W1.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[6] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[6] = {13.0f, 18.0f, 17.0f, 24.0f, 21.0f, 30.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 6; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } +} + +TEST_P(CppGradients, TestReluGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + 
GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(X) + * Y = Relu(X) + * outputs = tape.gradient(Y, [X]) + */ + std::vector outputs(1); + s = RunModel(ReluGradModel, ctx.get(), {X.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[9] = {0}; + memcpy(&result_data[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + float expected_dX[9] = {1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(result_data[j], expected_dX[j], tolerance); + } + + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); +} + +TEST_P(CppGradients, TestSoftmaxLossGrad) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = scores + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // y = labels + int y_vals[] = {1, 0, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(X) + * tape.watch(labels) + * loss = SoftmaxLoss(X, labels) + * outputs = tape.gradient(loss, [X, labels]) + * + * + */ + + std::vector outputs(2); + s = RunModel(SoftmaxLossGradModel, ctx.get(), {X.get(), y.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[9] = {0}; + memcpy(&result_data[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + float expected_dX[9] = {0.090f, -0.7553f, 0.6652f, -0.9099f, 0.2447f, + 0.6652f, 0.8437f, -0.8858f, 0.0420f}; + float tolerance = 1e-3; + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(result_data[j], expected_dX[j], tolerance); + } + + // Only Unref() first output as 2nd is nullptr grad for labels + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); +} + +TEST_P(CppGradients, TestMNISTGrad) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. 
+ GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t X_dims[] = {2, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t y_dims[] = {2}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + // Register Grads + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * + * tape.watch(W1) + * tape.watch(W2) + * mm = X*W1 + * hidden = Relu(mm) + * scores = W2*hidden + * loss = SoftmaxLoss(scores, y) + * outputs = tape.gradient(loss, [A, B]) + * + */ + + std::vector outputs(3); + s = RunModel(MNISTGradModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float tolerance = 1e-3; + TF_Tensor* dW1_tensor; + s = GetValue(outputs[0], &dW1_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dW1_tensor), + TF_TensorByteSize(dW1_tensor)); + + float expected_dW1[4] = {0.0f, 3.2f, 0.0f, 4.8f}; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dW1[j], tolerance); + } + + TF_Tensor* dW2_tensor; + s = GetValue(outputs[1], &dW2_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(dW2_tensor), + TF_TensorByteSize(dW2_tensor)); + + float expected_dW2[4] = {0.0f, 0.0f, 46.0f, -46.0f}; // dLoss + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dW2[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + outputs[2]->Unref(); + TF_DeleteTensor(dW1_tensor); + TF_DeleteTensor(dW2_tensor); +} + +TEST_P(CppGradients, TestScalarMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr eta; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = ScalarTensorHandle(ctx.get(), 1.5f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + eta.reset(x_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + + GradientRegistry registry; + std::vector outputs(1); + Status s = RunModel(ScalarMulModel, 
ctx.get(), {eta.get(), A.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dA_tensor; + s = GetValue(outputs[0], &dA_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dA_tensor), + TF_TensorByteSize(dA_tensor)); + + float tolerance = 1e-3; + float eta_val = 1.5f; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], eta_val * A_vals[j], tolerance); + } + + outputs[0]->Unref(); + TF_DeleteTensor(dA_tensor); +} + +TEST_P(CppGradients, TestMNIST_Training) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t X_dims[] = {2, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // TODO(amturati): use random initializer for weights instead of + // constant values. + + // W1 = first weights + float W1_vals[] = {-.01f, 0.4f, 0.5f, -.2f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t y_dims[] = {2}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + // Register Grads + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Prepare for training + std::vector weights; + weights.push_back(W1.get()); + weights.push_back(W2.get()); + + // Set learning rate to be 1e-1 + AbstractTensorHandle* learning_rate = nullptr; + s = ScalarTensorHandle(ctx.get(), 1e-1, &learning_rate); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Train + int num_iters = 10; + std::vector mnist_outputs(3); + std::vector grads(2); + for (int i = 0; i < num_iters; i++) { + // Run Forward Pass + s = RunModel(MNISTGradModel, ctx.get(), + {X.get(), weights[0], weights[1], y.get()}, + absl::MakeSpan(mnist_outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Fill grads + grads[0] = mnist_outputs[0]; + grads[1] = mnist_outputs[1]; + + // Gradient Update + s = UpdateWeights(ctx.get(), grads, weights, learning_rate); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + } + + grads[0]->Unref(); // release W1_grad + grads[1]->Unref(); // release W2_grad + mnist_outputs[2]->Unref(); // release loss +} + +#ifdef PLATFORM_GOOGLE +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, CppGradients, + ::testing::Combine(::testing::Values("graphdef", "mlir"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#else +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, CppGradients, + 
::testing::Combine(::testing::Values("graphdef", "mlir"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#endif +} // namespace +} // namespace internal +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/mnist_gradients_testutil.cc b/tensorflow/c/eager/mnist_gradients_testutil.cc new file mode 100644 index 00000000000..932605ab8e0 --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_testutil.cc @@ -0,0 +1,518 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/mnist_gradients_testutil.h" + +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +// ========================== Tape Ops ============================== + +namespace tensorflow { +namespace gradients { +namespace internal { + +using std::vector; +using tensorflow::tracing::TracingOperation; + +// Computes `inputs[0] + inputs[1]` and records it on the tape. +Status Add(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr add_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(add_op.get(), "Add", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(add_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(add_op.get())->SetOpName("my_add")); + } + TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[1], &forward_op)); + int num_retvals = 1; + return Execute(add_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. 
+Status MatMul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a, bool transpose_b, + const GradientRegistry& registry) { + AbstractOperationPtr matmul_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(matmul_op.get(), "MatMul", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(matmul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(matmul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[1], &forward_op)); + TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( + matmul_op.get(), "transpose_a", transpose_a, &forward_op)); + TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( + matmul_op.get(), "transpose_b", transpose_b, &forward_op)); + + int num_retvals = 1; + return Execute(matmul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +Status Mul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractOperationPtr mul_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(mul_op.get(), "Mul", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(mul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(mul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[1], &forward_op)); + + int num_retvals = 1; + return Execute(mul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `Relu(inputs[0])` and records it on the tape. +Status Relu(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractOperationPtr relu_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(relu_op.get(), "Relu", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(relu_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(relu_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(AddInput(relu_op.get(), inputs[0], &forward_op)); + int num_retvals = 1; + return Execute(relu_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `SoftmaxLoss(scores, labels)` where labels are categorical (not +// one-hot) and records it on the tape. 
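// [Editor's note] "Categorical (not one-hot)" means `labels` holds one class
// index per row of `scores`. For example, with three 3-class score rows,
// labels = {1, 0, 1} selects class 1, class 0 and class 1; the one-hot
// equivalent would be {{0,1,0},{1,0,0},{0,1,0}}. This matches the int32 label
// tensors built in the tests (e.g. y_vals[] = {1, 0, 1}).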
+Status SparseSoftmaxCrossEntropyWithLogits( + AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractTensorHandle* scores = inputs[0]; + AbstractTensorHandle* labels = inputs[1]; + + AbstractOperationPtr sm_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(sm_op.get(), "SparseSoftmaxCrossEntropyWithLogits", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(sm_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sm_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(sm_op.get(), scores, &forward_op)); + TF_RETURN_IF_ERROR(AddInput(sm_op.get(), labels, &forward_op)); + + int num_retvals = 2; // returns loss values and backprop + return Execute(sm_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +//===================== Test Models to run ========================= + +// Computes +// y = inputs[0] + inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status AddGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + std::vector add_outputs(1); + TF_RETURN_IF_ERROR(Add(ctx, tape, inputs, absl::MakeSpan(add_outputs), + registry)); // Compute x+y. + std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(add_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto add_output : add_outputs) { + add_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +// Computes +// y = inputs[0] * inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status MatMulGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + vector mm_outputs(1); + TF_RETURN_IF_ERROR(MatMul(ctx, tape, inputs, absl::MakeSpan(mm_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute x*y. 
+ + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(mm_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto mm_output : mm_outputs) { + mm_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +// Model to run 2-layer net +Status MNISTForwardModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + /** + * We will trace a 2-layer fully connected network for an MNIST model: + * + * def mnist_forward(X, W1, W2, y_labels): + * mm_out_1 = tf.matmul(X,W1) + * hidden_layer = tf.nn.relu(mm_out_1) + * scores = tf.matmul(hidden_layer,W2) + * softmax = + * tf.nn.sparse_softmax_cross_entropy_with_logits(scores, + * y_labels) + * return scores, softmax + * + * Use this convention for inputs: + * + * inputs = [X, W1, W2, y_labels] + * + */ + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + AbstractTensorHandle* W2 = inputs[2]; + AbstractTensorHandle* y_labels = inputs[3]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(W1)); // Watch W1. + tape->Watch(ToId(W2)); // Watch W2. + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute X*W1 + + TF_RETURN_IF_ERROR(Relu(ctx, tape, {temp_outputs[0]}, + absl::MakeSpan(temp_outputs), "relu", + registry)); // Compute Relu(X*W1) + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {temp_outputs[0], W2}, + absl::MakeSpan(temp_outputs), "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false, + registry)); // Compute W2*Relu(X*W1) + + AbstractTensorHandle* scores = temp_outputs[0]; + + temp_outputs.resize(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyWithLogits( + ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmax_loss", registry)); // Compute Softmax(Scores,labels) + + AbstractTensorHandle* loss_vals = temp_outputs[0]; + + outputs[0] = scores; + outputs[1] = loss_vals; + delete tape; + return Status::OK(); +} + +Status MatMulTransposeModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(X)); + tape->Watch(ToId(W1)); + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/true, + /*transpose_b=*/false, registry)); // Compute X*W1 + + outputs[0] = temp_outputs[0]; + + delete tape; + return Status::OK(); +} + +Status ReluGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch X + vector relu_outputs(1); + TF_RETURN_IF_ERROR(Relu(ctx, tape, inputs, absl::MakeSpan(relu_outputs), + "relu0", registry)); // Relu(X) + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(relu_outputs[0])}, + 
/*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + for (auto relu_output : relu_outputs) { + relu_output->Unref(); + } + + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + +Status SoftmaxLossGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch scores. + tape->Watch(ToId(inputs[1])); // Watch labels. + vector sm_outputs(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyWithLogits( + ctx, tape, inputs, absl::MakeSpan(sm_outputs), "softmax0", registry)); + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(sm_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +Status MNISTGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + AbstractTensorHandle* W2 = inputs[2]; + AbstractTensorHandle* y_labels = inputs[3]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/true); + tape->Watch(ToId(X)); // Watch X. + tape->Watch(ToId(W1)); // Watch W1. + tape->Watch(ToId(W2)); // Watch W1. + vector temp_outputs(1); + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute X*W1 + + AbstractTensorHandle* mm = temp_outputs[0]; + + TF_RETURN_IF_ERROR(Relu(ctx, tape, {mm}, + absl::MakeSpan(temp_outputs), // Relu(X*W1) + "relu0", registry)); + + AbstractTensorHandle* hidden = temp_outputs[0]; + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {hidden, W2}, + absl::MakeSpan(temp_outputs), "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false, + registry)); // W2*Relu(X*W1) + + AbstractTensorHandle* scores = temp_outputs[0]; + + temp_outputs.resize(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyWithLogits( + ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmaxloss", registry)); // W2*Relu(X*W1) + + AbstractTensorHandle* loss = temp_outputs[0]; + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR( + tape->ComputeGradient(vspace, /*target_tensor_ids=*/{ToId(loss)}, + /*source_tensor_ids=*/{ToId(W1), ToId(W2)}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + // Only release 2nd temp output as first holds loss values. 
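  // [Editor's note] temp_outputs[0] (the loss) is intentionally kept alive:
  // it is returned to the caller as outputs[2] below, and the caller releases
  // it (see mnist_outputs[2]->Unref() in TestMNIST_Training).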
+ temp_outputs[1]->Unref(); + + outputs[0] = out_grads[0]; // dW1 + outputs[1] = out_grads[1]; // dW2 + outputs[2] = loss; + + delete tape; + return Status::OK(); +} + +Status ScalarMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* eta = inputs[0]; + AbstractTensorHandle* A = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(Mul(ctx, tape, {eta, A}, absl::MakeSpan(temp_outputs), + "scalarMul0", registry)); // Compute eta*A + + outputs[0] = temp_outputs[0]; + + delete tape; + return Status::OK(); +} + +Status MatMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + std::vector temp_outputs(1); + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute X*W1 + + outputs[0] = temp_outputs[0]; + delete tape; + return Status::OK(); +} + +Status MulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* x = inputs[0]; + AbstractTensorHandle* y = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + std::vector temp_outputs(1); + TF_RETURN_IF_ERROR(Mul(ctx, tape, {x, y}, absl::MakeSpan(temp_outputs), + "mul0", registry)); // Compute x*y + + outputs[0] = temp_outputs[0]; + delete tape; + return Status::OK(); +} + +Status SoftmaxModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* x = inputs[0]; + AbstractTensorHandle* labels = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + std::vector temp_outputs(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyWithLogits( + ctx, tape, {x, labels}, absl::MakeSpan(temp_outputs), "sm_loss", + registry)); + + outputs[0] = temp_outputs[0]; // loss values + + delete tape; + return Status::OK(); +} + +// ============================= End Models ================================ + +} // namespace internal +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/mnist_gradients_testutil.h b/tensorflow/c/eager/mnist_gradients_testutil.h new file mode 100644 index 00000000000..1cf87bb9dee --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_testutil.h @@ -0,0 +1,143 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ +#define TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ +#include <memory> + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/status.h" + +// ========================== Tape Ops ============================== + +namespace tensorflow { +namespace gradients { +namespace internal { +// Computes `inputs[0] + inputs[1]` and records it on the tape. +Status Add(AbstractContext* ctx, Tape* tape, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, + const GradientRegistry& registry); + +// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. +Status MatMul(AbstractContext* ctx, Tape* tape, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, const char* name, + bool transpose_a, bool transpose_b, + const GradientRegistry& registry); + +// Computes `inputs[0] * inputs[1]` and records it on the tape. +Status Mul(AbstractContext* ctx, Tape* tape, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, const char* name, + const GradientRegistry& registry); + +// Computes `Relu(inputs[0])` and records it on the tape. +Status Relu(AbstractContext* ctx, Tape* tape, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, const char* name, + const GradientRegistry& registry); + +// Computes `SoftmaxLoss(scores, labels)` for matrices and records it on the +// tape. +Status SparseSoftmaxCrossEntropyWithLogits( + AbstractContext* ctx, Tape* tape, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, const char* name, + const GradientRegistry& registry); + +// ====================== End Tape Ops ============================ + +// Computes +// y = inputs[0] + inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status AddGradModel(AbstractContext* ctx, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, + const GradientRegistry& registry); + +// Computes +// y = inputs[0] * inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status MatMulGradModel(AbstractContext* ctx, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, + const GradientRegistry& registry); + +// Computes 2-layer Neural Network with Softmax Loss. +Status MNISTForwardModel(AbstractContext* ctx, + absl::Span<AbstractTensorHandle* const> inputs, + absl::Span<AbstractTensorHandle*> outputs, + const GradientRegistry& registry); + +// Computes MatMul with first matrix transposed.
+Status MatMulTransposeModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify ReluGrad functionality +Status ReluGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify SoftmaxGrad functionality +Status SoftmaxLossGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify Multi-grad functionality for MNIST +Status MNISTGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify scalar-tensor multiplication Op +Status ScalarMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +Status MatMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +Status MulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +Status SoftmaxModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +} // namespace internal +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 0d0e5ffce10..3eec95294b3 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -76,10 +76,26 @@ cc_library( "//tensorflow/c/eager:c_api_experimental", "//tensorflow/core:lib", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", ], ) +tf_cc_test( + name = "parallel_device_lib_test", + srcs = ["parallel_device_lib_test.cc"], + deps = [ + ":parallel_device_lib", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "parallel_device_testlib", testonly = 1, @@ -87,7 +103,6 @@ cc_library( hdrs = ["parallel_device_testlib.h"], deps = [ ":parallel_device", - ":parallel_device_ops", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -102,7 +117,6 @@ tf_cc_test( srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", - ":parallel_device_ops", ":parallel_device_testlib", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", @@ -122,7 +136,6 @@ tf_cc_test( args = ["--heap_check=local"], deps = [ ":parallel_device", - ":parallel_device_ops", ":parallel_device_testlib", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", @@ -134,19 +147,3 @@ tf_cc_test( "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", ], ) - -# Note: ParallelDevice-specific ops are experimental and not currently linked in -# to TensorFlow by default, just used in a few tests. 
-filegroup( - name = "parallel_device_ops_srcs", - srcs = ["parallel_device_ops.cc"], - visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], -) - -cc_library( - name = "parallel_device_ops", - srcs = [":parallel_device_ops_srcs"], - visibility = ["//tensorflow:internal"], - deps = ["//tensorflow/core:framework"], - alwayslink = 1, -) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index d0e9f351478..41bde23448b 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -136,13 +136,6 @@ absl::optional> ExecuteWithSpecialOps( } result.emplace(std::move(outputs)); return result; - } else if (operation_name == std::string("DeviceID")) { - std::vector result_content; - result_content.reserve(1); - result_content.push_back(parallel_device.DeviceIDs(context, status)); - if (TF_GetCode(status) != TF_OK) return result; - result.emplace(std::move(result_content)); - return result; } std::vector parallel_inputs; std::vector> implicitly_broadcast_tensors; @@ -255,28 +248,44 @@ TFE_TensorHandle* CopyTensorFromParallelDevice(TFE_Context* context, // Since this function is used to satisfy the TFE_CustomDevice C API, // device_info is passed in using a C-style generic. It must always be a // ParallelDevice. -void ParallelDeviceExecute(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, - const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, +void ParallelDeviceExecute(const TFE_Op* original_op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* status, void* device_info) { + const char* requested_placement = TFE_OpGetDevice(original_op, status); + if (*requested_placement == '\0') { + TF_SetStatus( + status, TF_INTERNAL, + "Ops must be placed on the parallel device explicitly, or their inputs " + "first un-packed. Got an un-placed op with an input placed on the " + "parallel device."); + return; + } + TFE_Context* context = TFE_OpGetContext(original_op, status); + if (TF_GetCode(status) != TF_OK) return; + const char* operation_name = TFE_OpGetName(original_op, status); + if (TF_GetCode(status) != TF_OK) return; + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(original_op); + NamedParallelDevice* named_device = reinterpret_cast(device_info); std::vector typed_inputs; + int num_inputs = TFE_OpGetFlatInputCount(original_op, status); + if (TF_GetCode(status) != TF_OK) return; typed_inputs.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle* input = TFE_OpGetFlatInput(original_op, i, status); + if (TF_GetCode(status) != TF_OK) return; const char* tensor_handle_device = - TFE_TensorHandleDeviceName(inputs[i], status); + TFE_TensorHandleDeviceName(input, status); if (TF_GetCode(status) != TF_OK) return; if (named_device->name() == tensor_handle_device) { // We assume that any tensors already placed on this device are // ParallelTensors. 
typed_inputs.emplace_back(reinterpret_cast( - TFE_TensorHandleDevicePointer(inputs[i], status))); + TFE_TensorHandleDevicePointer(input, status))); if (TF_GetCode(status) != TF_OK) return; } else { - typed_inputs.emplace_back(inputs[i]); + typed_inputs.emplace_back(input); } } diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index 768f686bd88..e270bfcbb80 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" +#include "tensorflow/c/tf_status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" @@ -118,6 +119,9 @@ class DeviceThread { int expected_max_outputs_ TF_GUARDED_BY(execution_mutex_); // Outputs std::vector op_outputs_ TF_GUARDED_BY(execution_mutex_); + // TF_Status is an incomplete type and so can't be stack allocated. To avoid + // unnecessary allocations each Execute call, we keep one heap-allocated + // version for the thread. StatusPtr status_ TF_GUARDED_BY(execution_mutex_); const std::string device_; @@ -188,6 +192,9 @@ std::vector DeviceThread::Join(TF_Status* status) { if (TF_GetCode(status_.get()) != TF_OK) { TF_SetStatus(status, TF_GetCode(status_.get()), TF_Message(status_.get())); + // Reset the member `status_` so future op executions (after recovery from + // the bad `status`) start with an OK status. + TF_SetStatus(status_.get(), TF_OK, ""); } execution_state_ = ExecutionState::kIdle; result = std::move(op_outputs_); @@ -255,18 +262,27 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } -std::unique_ptr ParallelDevice::DeviceIDs( - TFE_Context* context, TF_Status* status) const { +std::unique_ptr ParallelDevice::Vector( + TFE_Context* context, TF_Status* status, + absl::Span values) const { // TODO(allenl): We could cache DeviceIDs (keyed by context). 
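// Illustrative caller sketch (hypothetical, not part of this change): Vector
// generalizes DeviceIDs by packing one caller-supplied int32 scalar per
// underlying device. Assuming a two-device ParallelDevice `pd`, a
// TFE_Context* `context`, and a TF_Status* `status`:
//
//   std::unique_ptr<ParallelTensor> per_device =
//       pd.Vector(context, status, {7, 9});  // 7 on device 0, 9 on device 1
//   std::unique_ptr<ParallelTensor> ids =
//       pd.DeviceIDs(context, status);       // 0 on device 0, 1 on device 1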
std::vector components; components.reserve(underlying_devices_.size()); - for (int device_index = 0; device_index < underlying_devices_.size(); + + if (values.size() != num_underlying_devices()) { + TF_SetStatus( + status, TF_INVALID_ARGUMENT, + "Number of values did not match number of underlying devices."); + return nullptr; + } + + for (int device_index = 0; device_index < num_underlying_devices(); ++device_index) { - int32_t* device_id = new int32_t; - *device_id = device_index; + int32_t* device_value = new int32_t; + *device_value = values[device_index]; std::unique_ptr tensor( TF_NewTensor( - TF_INT32, /*dims=*/nullptr, /*num_dims=*/0, device_id, + TF_INT32, /*dims=*/nullptr, /*num_dims=*/0, device_value, sizeof(int32_t), [](void* data, size_t, void* arg) { delete reinterpret_cast(data); @@ -295,6 +311,16 @@ std::unique_ptr ParallelDevice::DeviceIDs( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + std::vector ids; + ids.reserve(num_underlying_devices()); + for (int i = 0; i < num_underlying_devices(); ++i) { + ids.push_back(i); + } + return Vector(context, status, ids); +} + absl::optional>> ParallelDevice::Execute(TFE_Context* context, const std::vector& inputs, @@ -319,21 +345,36 @@ ParallelDevice::Execute(TFE_Context* context, std::move(device_inputs), attributes, expected_max_outputs); } + StatusPtr first_bad_status(nullptr); for (int device_index = 0; device_index < underlying_devices_.size(); ++device_index) { DeviceThread* device_thread = device_threads_[device_index].get(); per_device_output_tensors.push_back(device_thread->Join(status)); - if (TF_GetCode(status) != TF_OK) return result; + // We will run every Join even if there are bad statuses in case the user + // wants to recover and continue running ops on the parallel device (which + // would otherwise deadlock). + if (TF_GetCode(status) != TF_OK && first_bad_status == nullptr) { + first_bad_status.reset(TF_NewStatus()); + TF_SetStatus(first_bad_status.get(), TF_GetCode(status), + TF_Message(status)); + } + if (device_index == 0) { first_op_output_count = per_device_output_tensors.rbegin()->size(); } else { - if (per_device_output_tensors.rbegin()->size() != first_op_output_count) { - TF_SetStatus(status, TF_INTERNAL, + if (first_bad_status == nullptr && + per_device_output_tensors.rbegin()->size() != first_op_output_count) { + first_bad_status.reset(TF_NewStatus()); + TF_SetStatus(first_bad_status.get(), TF_INTERNAL, "Parallel ops produced different numbers of tensors."); - return result; } } } + if (first_bad_status != nullptr) { + TF_SetStatus(status, TF_GetCode(first_bad_status.get()), + TF_Message(first_bad_status.get())); + return result; + } // For each output of the original operation, pack the per-device // TensorHandles we've computed into a single parallel TensorHandle. std::vector> per_device_outputs; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h index cbfea31d95f..b3dc47ab088 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -21,6 +21,7 @@ limitations under the License. 
#include #include "absl/types/optional.h" +#include "absl/types/span.h" #include "absl/types/variant.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" @@ -61,6 +62,11 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // Construct a parallel tensor consisting of the scalar values from `values`. + std::unique_ptr Vector( + TFE_Context* context, TF_Status* status, + absl::Span values) const; + // A parallel tensor with scalar integers numbering component devices. std::unique_ptr DeviceIDs(TFE_Context* context, TF_Status* status) const; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc new file mode 100644 index 00000000000..35befe959cb --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc @@ -0,0 +1,84 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace parallel_device { + +TEST(PARALLEL_DEVICE_LIB, TestOpWithError) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr config( + TF_CreateConfig( + /*xla*/ false, + /* gpu_memory_allow_growth */ true, /* num_cpu_devices */ + 2), + TF_DeleteBuffer); + TFE_ContextOptionsSetConfig(opts.get(), config->data, config->length, + status.get()); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + std::vector devices{ + "/job:localhost/replica:0/task:0/device:CPU:0", + "/job:localhost/replica:0/task:0/device:CPU:1"}; + ParallelDevice parallel_device(std::move(devices)); + std::unique_ptr handle_op( + TFE_NewOp(context.get(), "VarHandleOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(handle_op.get(), "dtype", TF_FLOAT); + TFE_OpSetAttrShape(handle_op.get(), "shape", /*dims=*/nullptr, /*num_dims=*/0, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + auto outputs = + parallel_device.Execute(context.get(), std::vector(), + "VarHandleOp", TFE_OpGetAttrs(handle_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + const std::vector>& handles = *outputs; + std::vector handle_inputs; + handle_inputs.reserve(handles.size()); + for (auto& handle : handles) { + handle_inputs.push_back(handle.get()); + } + std::unique_ptr 
read_op( + TFE_NewOp(context.get(), "ReadVariableOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(read_op.get(), "dtype", TF_FLOAT); + parallel_device.Execute(context.get(), handle_inputs, "ReadVariableOp", + TFE_OpGetAttrs(read_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); + TF_SetStatus(status.get(), TF_OK, ""); + + // Check that ops still run successfully on the device. + parallel_device.Execute(context.get(), std::vector(), + "VarHandleOp", TFE_OpGetAttrs(handle_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); +} + +} // namespace parallel_device +} // namespace tensorflow diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc index 828dcbae093..67bc596b180 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc @@ -279,30 +279,4 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); ASSERT_EQ(underlying_devices[1], second_device); } - // Compute the device ID twice and verify the result - for (int i = 0; i < 2; ++i) { - std::unique_ptr op( - TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - TFE_OpSetDevice(op.get(), device_name, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - std::array components; - ExtractPerDeviceValues(context, result_handle, &components, status.get()); - TFE_DeleteTensorHandle(result_handle); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - ExpectScalarEq(components[0].get(), 0); - ExpectScalarEq(components[1].get(), 1); - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } } diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 27629bb3bdf..fcebe973500 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -146,13 +146,16 @@ class GradientTape { // once) and produces the gradient of the target tensors with respect to the // source tensors. The output gradients are used if not empty and not // null. The result is populated with one tensor per target element. + // When running backward functions, builds zeros-like tensors for + // incoming grads which are nullptrs, unless `build_default_zeros_grads` + // is set to false. 
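// Example (hypothetical sketch, assuming a tape `t`, a `vspace`, and recorded
// tensor ids `y_id` / `x_id`): a caller that wants to see missing gradients
// directly can disable the default zero-building:
//
//   std::vector<Gradient*> grads;
//   TF_RETURN_IF_ERROR(t->ComputeGradient(
//       vspace, /*target_tensor_ids=*/{y_id}, /*source_tensor_ids=*/{x_id},
//       /*sources_that_are_targets=*/{}, /*output_gradients=*/{}, &grads,
//       /*build_default_zeros_grads=*/false));
//
// With the flag set to false, backward functions may receive nullptr incoming
// gradients and are expected to handle them themselves.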
Status ComputeGradient( const VSpace& vspace, const gtl::ArraySlice target_tensor_ids, const gtl::ArraySlice source_tensor_ids, const std::unordered_map& sources_that_are_targets, gtl::ArraySlice output_gradients, - std::vector* result); + std::vector* result, bool build_default_zeros_grads = true); bool IsPersistent() const { return persistent_; } @@ -655,8 +658,8 @@ Status GradientTape::ComputeGradient( const gtl::ArraySlice target_tensor_ids, const gtl::ArraySlice source_tensor_ids, const std::unordered_map& sources_that_are_targets, - gtl::ArraySlice output_gradients, - std::vector* result) { + gtl::ArraySlice output_gradients, std::vector* result, + bool build_default_zeros_grads) { std::unordered_set sources_set(source_tensor_ids.begin(), source_tensor_ids.end()); BackpropInitialState state = PrepareBackprop( @@ -717,14 +720,14 @@ Status GradientTape::ComputeGradient( const int64 id = trace.output_tensor_info[i].GetID(); auto grad_it = gradients.find(id); if (grad_it == gradients.end()) { - auto func_name_it = - FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); - if (func_name_it != FunctionsAcceptingNoneForIndicesMap()->end() && - func_name_it->second.find(i) != func_name_it->second.end()) { - out_gradients.push_back(nullptr); - } else { - out_gradients.push_back(nullptr); - zero_indices.push_back(i); + out_gradients.push_back(nullptr); + if (build_default_zeros_grads) { + auto func_name_it = + FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); + if (func_name_it == FunctionsAcceptingNoneForIndicesMap()->end() || + func_name_it->second.find(i) == func_name_it->second.end()) { + zero_indices.push_back(i); + } } } else { any_gradient_nonzero = true; @@ -745,6 +748,7 @@ Status GradientTape::ComputeGradient( } } std::vector in_gradients; + DCHECK(build_default_zeros_grads || zero_indices.empty()); if (any_gradient_nonzero) { for (const auto i : zero_indices) { out_gradients[i] = trace.output_tensor_info[i].ZerosLike(); diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.cc b/tensorflow/c/experimental/filesystem/modular_filesystem.cc index 00a587521fd..9c8d3518800 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/modular_filesystem.cc @@ -35,8 +35,8 @@ using UniquePtrTo_TF_Status = ::std::unique_ptr; Status ModularFileSystem::NewRandomAccessFile( - const std::string& fname, - std::unique_ptr* result /*, TransactionToken* token */) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_random_access_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewRandomAccessFile()")); @@ -55,8 +55,8 @@ Status ModularFileSystem::NewRandomAccessFile( } Status ModularFileSystem::NewWritableFile( - const std::string& fname, - std::unique_ptr* result /*, TransactionToken* token */) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_writable_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewWritableFile()")); @@ -75,8 +75,8 @@ Status ModularFileSystem::NewWritableFile( } Status ModularFileSystem::NewAppendableFile( - const std::string& fname, - std::unique_ptr* result /*, TransactionToken* token */) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_appendable_file == nullptr) return 
errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support NewAppendableFile()")); @@ -95,8 +95,8 @@ Status ModularFileSystem::NewAppendableFile( } Status ModularFileSystem::NewReadOnlyMemoryRegionFromFile( - const std::string& fname, std::unique_ptr* - result /*, TransactionToken* token */) { + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) { if (ops_->new_read_only_memory_region_from_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, @@ -116,8 +116,8 @@ Status ModularFileSystem::NewReadOnlyMemoryRegionFromFile( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::FileExists( - const std::string& fname /*, TransactionToken* token */) { +Status ModularFileSystem::FileExists(const std::string& fname, + TransactionToken* token) { if (ops_->path_exists == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support FileExists()")); @@ -129,9 +129,9 @@ Status ModularFileSystem::FileExists( return StatusFromTF_Status(plugin_status.get()); } -bool ModularFileSystem::FilesExist( - const std::vector& files, - std::vector* status /*, TransactionToken* token */) { +bool ModularFileSystem::FilesExist(const std::vector& files, + TransactionToken* token, + std::vector* status) { if (ops_->paths_exist == nullptr) return FileSystem::FilesExist(files, status); @@ -162,9 +162,9 @@ bool ModularFileSystem::FilesExist( return result; } -Status ModularFileSystem::GetChildren( - const std::string& dir, - std::vector* result /*, TransactionToken* token */) { +Status ModularFileSystem::GetChildren(const std::string& dir, + TransactionToken* token, + std::vector* result) { if (ops_->get_children == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", dir, " does not support GetChildren()")); @@ -188,9 +188,9 @@ Status ModularFileSystem::GetChildren( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::GetMatchingPaths( - const std::string& pattern, - std::vector* result /*, TransactionToken* token */) { +Status ModularFileSystem::GetMatchingPaths(const std::string& pattern, + TransactionToken* token, + std::vector* result) { if (ops_->get_matching_paths == nullptr) return internal::GetMatchingPaths(this, Env::Default(), pattern, result); @@ -211,8 +211,8 @@ Status ModularFileSystem::GetMatchingPaths( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::DeleteFile( - const std::string& fname /*, TransactionToken* token */) { +Status ModularFileSystem::DeleteFile(const std::string& fname, + TransactionToken* token) { if (ops_->delete_file == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support DeleteFile()")); @@ -224,9 +224,10 @@ Status ModularFileSystem::DeleteFile( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::DeleteRecursively( - const std::string& dirname, int64* undeleted_files, - int64* undeleted_dirs /*, TransactionToken* token */) { +Status ModularFileSystem::DeleteRecursively(const std::string& dirname, + TransactionToken* token, + int64* undeleted_files, + int64* undeleted_dirs) { if (undeleted_files == nullptr || undeleted_dirs == nullptr) return errors::FailedPrecondition( "DeleteRecursively must not be called with `undeleted_files` or " @@ -247,8 +248,8 @@ Status ModularFileSystem::DeleteRecursively( return 
StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::DeleteDir( - const std::string& dirname /*, TransactionToken* token */) { +Status ModularFileSystem::DeleteDir(const std::string& dirname, + TransactionToken* token) { if (ops_->delete_dir == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", dirname, " does not support DeleteDir()")); @@ -260,8 +261,8 @@ Status ModularFileSystem::DeleteDir( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::RecursivelyCreateDir( - const std::string& dirname /*, TransactionToken* token */) { +Status ModularFileSystem::RecursivelyCreateDir(const std::string& dirname, + TransactionToken* token) { if (ops_->recursively_create_dir == nullptr) return FileSystem::RecursivelyCreateDir(dirname); @@ -272,8 +273,8 @@ Status ModularFileSystem::RecursivelyCreateDir( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::CreateDir( - const std::string& dirname /*, TransactionToken* token */) { +Status ModularFileSystem::CreateDir(const std::string& dirname, + TransactionToken* token) { if (ops_->create_dir == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", dirname, " does not support CreateDir()")); @@ -285,9 +286,8 @@ Status ModularFileSystem::CreateDir( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::Stat( - const std::string& fname, - FileStatistics* stat /*, TransactionToken* token */) { +Status ModularFileSystem::Stat(const std::string& fname, + TransactionToken* token, FileStatistics* stat) { if (ops_->stat == nullptr) return errors::Unimplemented(tensorflow::strings::StrCat( "Filesystem for ", fname, " does not support Stat()")); @@ -310,8 +310,8 @@ Status ModularFileSystem::Stat( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::IsDirectory( - const std::string& name /*, TransactionToken* token */) { +Status ModularFileSystem::IsDirectory(const std::string& name, + TransactionToken* token) { if (ops_->is_directory == nullptr) return FileSystem::IsDirectory(name); UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus); @@ -321,9 +321,9 @@ Status ModularFileSystem::IsDirectory( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::GetFileSize( - const std::string& fname, - uint64* file_size /*, TransactionToken* token */) { +Status ModularFileSystem::GetFileSize(const std::string& fname, + TransactionToken* token, + uint64* file_size) { if (ops_->get_file_size == nullptr) { FileStatistics stat; Status status = Stat(fname, &stat); @@ -342,9 +342,9 @@ Status ModularFileSystem::GetFileSize( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::RenameFile( - const std::string& src, - const std::string& target /*, TransactionToken* token */) { +Status ModularFileSystem::RenameFile(const std::string& src, + const std::string& target, + TransactionToken* token) { if (ops_->rename_file == nullptr) { Status status = CopyFile(src, target); if (status.ok()) status = DeleteFile(src); @@ -359,9 +359,9 @@ Status ModularFileSystem::RenameFile( return StatusFromTF_Status(plugin_status.get()); } -Status ModularFileSystem::CopyFile( - const std::string& src, - const std::string& target /*, TransactionToken* token */) { +Status ModularFileSystem::CopyFile(const std::string& src, + const std::string& target, + TransactionToken* token) { if (ops_->copy_file == nullptr) return FileSystem::CopyFile(src, target); 
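// Illustrative call shape (hypothetical, not part of this change): with the
// TransactionToken* parameter now explicit in each signature, callers that do
// not use transactions pass nullptr. For a ModularFileSystem* `fs` and a path
// `fname`:
//
//   std::unique_ptr<WritableFile> file;
//   TF_RETURN_IF_ERROR(fs->NewWritableFile(fname, /*token=*/nullptr, &file));
//   TF_RETURN_IF_ERROR(fs->FileExists(fname, /*token=*/nullptr));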
UniquePtrTo_TF_Status plugin_status(TF_NewStatus(), TF_DeleteStatus); @@ -372,8 +372,7 @@ Status ModularFileSystem::CopyFile( return StatusFromTF_Status(plugin_status.get()); } -std::string ModularFileSystem::TranslateName( - const std::string& name /*, TransactionToken* token */) const { +std::string ModularFileSystem::TranslateName(const std::string& name) const { if (ops_->translate_name == nullptr) return FileSystem::TranslateName(name); char* p = ops_->translate_name(filesystem_.get(), name.c_str()); @@ -385,7 +384,7 @@ std::string ModularFileSystem::TranslateName( return ret; } -void ModularFileSystem::FlushCaches(/*TransactionToken* token*/) { +void ModularFileSystem::FlushCaches(TransactionToken* token) { if (ops_->flush_caches != nullptr) ops_->flush_caches(filesystem_.get()); } diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.h b/tensorflow/c/experimental/filesystem/modular_filesystem.h index a2639152eff..061a1aa446b 100644 --- a/tensorflow/c/experimental/filesystem/modular_filesystem.h +++ b/tensorflow/c/experimental/filesystem/modular_filesystem.h @@ -59,71 +59,48 @@ class ModularFileSystem final : public FileSystem { ~ModularFileSystem() override { ops_->cleanup(filesystem_.get()); } + TF_USE_FILESYSTEM_METHODS_WITH_NO_TRANSACTION_SUPPORT; + Status NewRandomAccessFile( - const std::string& fname, - std::unique_ptr* - result /*, TransactionToken* token = nullptr */) override; - Status NewWritableFile( - const std::string& fname, - std::unique_ptr* - result /*, TransactionToken* token = nullptr */) override; - Status NewAppendableFile( - const std::string& fname, - std::unique_ptr* - result /*, TransactionToken* token = nullptr */) override; + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) override; + Status NewWritableFile(const std::string& fname, TransactionToken* token, + std::unique_ptr* result) override; + Status NewAppendableFile(const std::string& fname, TransactionToken* token, + std::unique_ptr* result) override; Status NewReadOnlyMemoryRegionFromFile( - const std::string& fname, - std::unique_ptr* - result /*, TransactionToken* token = nullptr */) override; - Status FileExists( - const std::string& fname /*, TransactionToken* token = nullptr */) - override; + const std::string& fname, TransactionToken* token, + std::unique_ptr* result) override; + Status FileExists(const std::string& fname, TransactionToken* token) override; bool FilesExist(const std::vector& files, - std::vector* - status /*, TransactionToken* token = nullptr */) override; - Status GetChildren( - const std::string& dir, - std::vector* result /*, TransactionToken* token = nullptr */) - override; - Status GetMatchingPaths( - const std::string& pattern, - std::vector* - results /*, TransactionToken* token = nullptr */) override; - Status DeleteFile( - const std::string& fname /*, TransactionToken* token = nullptr */) - override; - Status DeleteRecursively( - const std::string& dirname, int64* undeleted_files, - int64* undeleted_dirs /*, TransactionToken* token = nullptr */) override; - Status DeleteDir( - const std::string& dirname /*, TransactionToken* token = nullptr */) - override; - Status RecursivelyCreateDir( - const std::string& dirname /*, TransactionToken* token = nullptr */) - override; - Status CreateDir( - const std::string& dirname /*, TransactionToken* token = nullptr */) - override; - Status Stat( - const std::string& fname, - FileStatistics* stat /*, TransactionToken* token = nullptr */) override; - Status IsDirectory( - const 
std::string& fname /*, TransactionToken* token = nullptr */) - override; - Status GetFileSize( - const std::string& fname, - uint64* file_size /*, TransactionToken* token = nullptr */) override; - Status RenameFile( - const std::string& src, - const std::string& target /*, TransactionToken* token = nullptr */) - override; - Status CopyFile(const std::string& src, - const std::string& - target /*, TransactionToken* token = nullptr */) override; - std::string TranslateName( - const std::string& name /*, TransactionToken* token = nullptr */) - const override; - void FlushCaches(/* TransactionToken* token=nullptr */) override; + TransactionToken* token, + std::vector* status) override; + Status GetChildren(const std::string& dir, TransactionToken* token, + std::vector* result) override; + Status GetMatchingPaths(const std::string& pattern, TransactionToken* token, + std::vector* results) override; + Status DeleteFile(const std::string& fname, TransactionToken* token) override; + Status DeleteRecursively(const std::string& dirname, TransactionToken* token, + int64* undeleted_files, + int64* undeleted_dirs) override; + Status DeleteDir(const std::string& dirname, + TransactionToken* token) override; + Status RecursivelyCreateDir(const std::string& dirname, + TransactionToken* token) override; + Status CreateDir(const std::string& dirname, + TransactionToken* token) override; + Status Stat(const std::string& fname, TransactionToken* token, + FileStatistics* stat) override; + Status IsDirectory(const std::string& fname, + TransactionToken* token) override; + Status GetFileSize(const std::string& fname, TransactionToken* token, + uint64* file_size) override; + Status RenameFile(const std::string& src, const std::string& target, + TransactionToken* token) override; + Status CopyFile(const std::string& src, const std::string& target, + TransactionToken* token) override; + std::string TranslateName(const std::string& name) const override; + void FlushCaches(TransactionToken* token) override; private: std::unique_ptr filesystem_; diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index b2636571c25..54217db1de0 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -29,10 +29,12 @@ cc_library( ":gcs_helper", ":ram_file_block_cache", "//tensorflow/c:env", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:variant", ], ) @@ -58,6 +60,7 @@ cc_library( deps = [ ":cleanup", "//tensorflow/c:env", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index b6b481cda66..8cd8ad7ca81 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -19,9 +19,11 @@ limitations under the License. 
#include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" +#include "absl/types/variant.h" #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for GCS environments. @@ -119,20 +121,20 @@ static int64_t LoadBufferFromGCS(const std::string& path, size_t offset, return -1; } int64_t read; - if (!absl::SimpleAtoi(stream.headers().find("content-length")->second, - &read)) { + auto content_length = stream.headers().find("content-length"); + if (content_length == stream.headers().end()) { // When we read a file with offset that is bigger than the actual file size. // GCS will return an empty header (e.g no `content-length` header). In this // case, we will set read to `0` and continue. - if (TF_GetCode(status) == TF_OUT_OF_RANGE) { - read = 0; - } else { - TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); - return -1; - } + read = 0; + } else if (!absl::SimpleAtoi(content_length->second, &read)) { + TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); + return -1; } // `TF_OUT_OF_RANGE` isn't considered as an error. So we clear it here. TF_SetStatus(status, TF_OK, ""); + TF_VLog(1, "Successful read of %s @ %u of size: %u", path.c_str(), offset, + read); stream.read(buffer, read); read = stream.gcount(); if (read < buffer_size) { @@ -145,6 +147,8 @@ static int64_t LoadBufferFromGCS(const std::string& path, size_t offset, path, " @ ", offset) .c_str()); } + TF_VLog(2, "Successful integrity check for: %s @ %u", path.c_str(), + offset); } } return read; @@ -258,7 +262,8 @@ static void SyncImpl(const std::string& bucket, const std::string& object, if (*offset == -1 || *offset == 0) { // UploadFile will automatically switch to resumable upload based on Client // configuration. 
- auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, object); + auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -277,15 +282,18 @@ static void SyncImpl(const std::string& bucket, const std::string& object, } else { std::string temporary_object = gcs::CreateRandomPrefixName("tf_writable_file_gcs"); - auto metadata = - gcs_client->UploadFile(outfile->getName(), bucket, temporary_object); + auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, + temporary_object, gcs::Fields("")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; } + TF_VLog(3, "AppendObject: gs://%s/%s to gs://%s/%s", bucket.c_str(), + temporary_object.c_str(), bucket.c_str(), object.c_str()); const std::vector source_objects = { {object, {}, {}}, {temporary_object, {}, {}}}; - metadata = gcs_client->ComposeObject(bucket, source_objects, object); + metadata = gcs_client->ComposeObject(bucket, source_objects, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -320,6 +328,8 @@ void Append(const TF_WritableFile* file, const char* buffer, size_t n, "The internal temporary file is not writable."); return; } + TF_VLog(3, "Append: gs://%s/%s size %u", gcs_file->bucket.c_str(), + gcs_file->object.c_str(), n); gcs_file->sync_need = true; gcs_file->outfile.write(buffer, n); if (!gcs_file->outfile) @@ -345,6 +355,8 @@ int64_t Tell(const TF_WritableFile* file, TF_Status* status) { void Flush(const TF_WritableFile* file, TF_Status* status) { auto gcs_file = static_cast(file->plugin_file); if (gcs_file->sync_need) { + TF_VLog(3, "Flush started: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (!gcs_file->outfile) { TF_SetStatus(status, TF_INTERNAL, "Could not append to the internal temporary file."); @@ -352,6 +364,8 @@ void Flush(const TF_WritableFile* file, TF_Status* status) { } SyncImpl(gcs_file->bucket, gcs_file->object, &gcs_file->offset, &gcs_file->outfile, gcs_file->gcs_client, status); + TF_VLog(3, "Flush finished: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (TF_GetCode(status) != TF_OK) return; gcs_file->sync_need = false; } else { @@ -360,11 +374,16 @@ void Flush(const TF_WritableFile* file, TF_Status* status) { } void Sync(const TF_WritableFile* file, TF_Status* status) { + auto gcs_file = static_cast(file->plugin_file); + TF_VLog(3, "Sync: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); Flush(file, status); } void Close(const TF_WritableFile* file, TF_Status* status) { auto gcs_file = static_cast(file->plugin_file); + TF_VLog(3, "Close: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (gcs_file->sync_need) { Flush(file, status); } @@ -427,6 +446,8 @@ GCSFile::GCSFile(google::cloud::storage::Client&& gcs_client) if (absl::SimpleAtoi(std::getenv(kMaxStaleness), &value)) { max_staleness = value; } + TF_VLog(1, "GCS cache max size = %u ; block size = %u ; max staleness = %u", + max_bytes, block_size, max_staleness); file_block_cache = std::make_unique( block_size, max_bytes, max_staleness, @@ -503,13 +524,18 @@ void Cleanup(TF_Filesystem* filesystem) { static void UncachedStatForObject(const std::string& bucket, const std::string& object, GcsFileStat* stat, gcs::Client* gcs_client, TF_Status* status) { - auto metadata = gcs_client->GetObjectMetadata(bucket, object); + auto metadata = 
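// Note on gcs::Fields (illustrative sketch, not part of this change): the
// option asks GCS to return only the listed metadata fields, keeping responses
// small. For example, fetching just an object's size from a gcs::Client*
// `client`:
//
//   auto meta = client->GetObjectMetadata(bucket, object, gcs::Fields("size"));
//   if (meta) std::uint64_t object_size = meta->size();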
gcs_client->GetObjectMetadata( + bucket, object, gcs::Fields("generation,size,timeStorageClassUpdated")); if (!metadata) return TF_SetStatusFromGCSStatus(metadata.status(), status); stat->generation_number = metadata->generation(); stat->base.length = metadata->size(); stat->base.mtime_nsec = metadata->time_storage_class_updated().time_since_epoch().count(); stat->base.is_directory = object.back() == '/'; + TF_VLog(1, + "Stat of: gs://%s/%s -- length: %u generation: %u; mtime_nsec: %u;", + bucket.c_str(), object.c_str(), stat->base.length, + stat->generation_number, stat->base.mtime_nsec); return TF_SetStatus(status, TF_OK, ""); } @@ -544,9 +570,10 @@ void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, if (TF_GetCode(status) != TF_OK) return -1; if (!gcs_file->file_block_cache->ValidateAndUpdateFileSignature( path, stat.generation_number)) { - std::cout - << "File signature has been changed. Refreshing the cache. Path: " - << path; + TF_VLog( + 1, + "File signature has been changed. Refreshing the cache. Path: %s", + path.c_str()); } read = gcs_file->file_block_cache->Read(path, offset, n, buffer, status); } else { @@ -578,6 +605,7 @@ void NewWritableFile(const TF_Filesystem* filesystem, const char* path, (gcs_file->compose ? 0 : -1)}); // We are responsible for freeing the pointer returned by TF_GetTempFileName free(temp_file_name); + TF_VLog(3, "GcsWritableFile: %s", path); TF_SetStatus(status, TF_OK, ""); } @@ -607,7 +635,8 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, } else { // If compose is true, we do not download anything. // Instead we only check if this file exists on server or not. - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object, + gcs::Fields("size")); TF_SetStatusFromGCSStatus(metadata.status(), status); if (TF_GetCode(status) == TF_OK) { file->plugin_file = new tf_writable_file::GCSFile( @@ -623,7 +652,8 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, return; } } - + TF_VLog(3, "GcsWritableFile: %s with existing file %s", path, + temp_file_name.c_str()); TF_SetStatus(status, TF_OK, ""); } @@ -638,7 +668,8 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, if (TF_GetCode(status) != TF_OK) return; auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -663,28 +694,190 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, } } -void CreateDir(const TF_Filesystem* filesystem, const char* path, - TF_Status* status) { +static void StatForObject(GCSFile* gcs_file, const std::string& path, + const std::string& bucket, const std::string& object, + GcsFileStat* stat, TF_Status* status) { + if (object.empty()) + return TF_SetStatus( + status, TF_INVALID_ARGUMENT, + absl::StrCat("'object' must be a non-empty string. 
(File: ", path, ")") + .c_str()); + TF_SetStatus(status, TF_OK, ""); + gcs_file->stat_cache->LookupOrCompute( + path, stat, + [gcs_file, bucket, object](const std::string& path, GcsFileStat* stat, + TF_Status* status) { + UncachedStatForObject(bucket, object, stat, &gcs_file->gcs_client, + status); + }, + status); +} + +static bool ObjectExists(GCSFile* gcs_file, const std::string& path, + const std::string& bucket, const std::string& object, + TF_Status* status) { + GcsFileStat stat; + StatForObject(gcs_file, path, bucket, object, &stat, status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_NOT_FOUND) + return false; + if (TF_GetCode(status) == TF_NOT_FOUND) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return !stat.base.is_directory; +} + +static bool BucketExists(GCSFile* gcs_file, const std::string& bucket, + TF_Status* status) { + auto metadata = + gcs_file->gcs_client.GetBucketMetadata(bucket, gcs::Fields("")); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_NOT_FOUND) + return false; + if (TF_GetCode(status) == TF_NOT_FOUND) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return true; +} + +static std::vector GetChildrenBounded( + GCSFile* gcs_file, std::string dir, uint64_t max_results, bool recursive, + bool include_self_directory_marker, TF_Status* status) { + std::string bucket, prefix; + MaybeAppendSlash(&dir); + ParseGCSPath(dir, true, &bucket, &prefix, status); + + std::vector result; + uint64_t count = 0; + std::string delimiter = recursive ? "" : "/"; + + for (auto&& item : gcs_file->gcs_client.ListObjectsAndPrefixes( + bucket, gcs::Prefix(prefix), gcs::Delimiter(delimiter), + gcs::Fields("items(name),prefixes"))) { + if (count == max_results) { + TF_SetStatus(status, TF_OK, ""); + return result; + } + if (!item) { + TF_SetStatusFromGCSStatus(item.status(), status); + return result; + } + auto value = *std::move(item); + std::string children = absl::holds_alternative(value) + ? 
absl::get(value) + : absl::get(value).name(); + auto pos = children.find(prefix); + if (pos != 0) { + TF_SetStatus(status, TF_INTERNAL, + absl::StrCat("Unexpected response: the returned file name ", + children, " doesn't match the prefix ", prefix) + .c_str()); + return result; + } + children.erase(0, prefix.length()); + if (!children.empty() || include_self_directory_marker) { + result.emplace_back(children); + } + ++count; + } + + return result; +} + +static bool FolderExists(GCSFile* gcs_file, std::string dir, + TF_Status* status) { + ExpiringLRUCache::ComputeFunc compute_func = + [gcs_file](const std::string& dir, GcsFileStat* stat, TF_Status* status) { + auto children = + GetChildrenBounded(gcs_file, dir, 1, true, true, status); + if (TF_GetCode(status) != TF_OK) return; + if (!children.empty()) { + stat->base = {0, 0, true}; + return TF_SetStatus(status, TF_OK, ""); + } else { + return TF_SetStatus(status, TF_INVALID_ARGUMENT, "Not a directory!"); + } + }; + GcsFileStat stat; + MaybeAppendSlash(&dir); + gcs_file->stat_cache->LookupOrCompute(dir, &stat, compute_func, status); + if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_INVALID_ARGUMENT) + return false; + if (TF_GetCode(status) == TF_INVALID_ARGUMENT) { + TF_SetStatus(status, TF_OK, ""); + return false; + } + return true; +} + +static void ClearFileCaches(GCSFile* gcs_file, const std::string& path) { + absl::ReaderMutexLock l(&gcs_file->block_cache_lock); + gcs_file->file_block_cache->RemoveFile(path); + gcs_file->stat_cache->Delete(path); +} + +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { std::string bucket, object; ParseGCSPath(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; + auto gcs_file = static_cast(filesystem->plugin_filesystem); if (object.empty()) { - auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); - TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); + bool result = BucketExists(gcs_file, bucket, status); + if (result) return TF_SetStatus(status, TF_OK, ""); + } + + GcsFileStat stat; + StatForObject(gcs_file, path, bucket, object, &stat, status); + if (TF_GetCode(status) != TF_NOT_FOUND) return; + + bool result = FolderExists(gcs_file, path, status); + if (TF_GetCode(status) != TF_OK || (TF_GetCode(status) == TF_OK && result)) + return; + return TF_SetStatus( + status, TF_NOT_FOUND, + absl::StrCat("The path ", path, " does not exist.").c_str()); +} + +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + std::string dir = path; + MaybeAppendSlash(&dir); + TF_VLog(3, + "CreateDir: creating directory with path: %s and " + "path_with_slash: %s", + path, dir.c_str()); + std::string bucket, object; + ParseGCSPath(dir, true, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + auto gcs_file = static_cast(filesystem->plugin_filesystem); + if (object.empty()) { + bool is_directory = BucketExists(gcs_file, bucket, status); + if (TF_GetCode(status) != TF_OK) return; + if (!is_directory) + TF_SetStatus(status, TF_NOT_FOUND, + absl::StrCat("The specified bucket ", dir, " was not found.") + .c_str()); return; } - MaybeAppendSlash(&object); - auto object_metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); - TF_SetStatusFromGCSStatus(object_metadata.status(), status); - if (TF_GetCode(status) == TF_NOT_FOUND) { - auto insert_metadata = - gcs_file->gcs_client.InsertObject(bucket, object, ""); - TF_SetStatusFromGCSStatus(insert_metadata.status(), 
status); - } else if (TF_GetCode(status) == TF_OK) { - TF_SetStatus(status, TF_ALREADY_EXISTS, path); + PathExists(filesystem, dir.c_str(), status); + if (TF_GetCode(status) == TF_OK) { + // Use the original name for a correct error here. + TF_VLog(3, "CreateDir: directory already exists, not uploading %s", path); + return TF_SetStatus(status, TF_ALREADY_EXISTS, path); } + + auto metadata = gcs_file->gcs_client.InsertObject( + bucket, object, "", + // Adding this parameter means HTTP_CODE_PRECONDITION_FAILED + // will be returned if the object already exists, so avoid reuploading. + gcs::IfGenerationMatch(0), gcs::Fields("")); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) + TF_SetStatus(status, TF_ALREADY_EXISTS, path); } // TODO(vnvo2409): `RecursivelyCreateDir` should use `CreateDir` instead of the @@ -700,79 +893,31 @@ void DeleteFile(const TF_Filesystem* filesystem, const char* path, auto gcs_file = static_cast(filesystem->plugin_filesystem); auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket, object); TF_SetStatusFromGCSStatus(gcs_status, status); + if (TF_GetCode(status) == TF_OK) ClearFileCaches(gcs_file, path); } +// Checks that the directory is empty (i.e no objects with this prefix exist). +// Deletes the GCS directory marker if it exists. void DeleteDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - std::string bucket, object; - ParseGCSPath(path, false, &bucket, &object, status); - if (TF_GetCode(status) != TF_OK) return; - MaybeAppendSlash(&object); + // A directory is considered empty either if there are no matching objects + // with the corresponding name prefix or if there is exactly one matching + // object and it is the directory marker. Therefore we need to retrieve + // at most two children for the prefix to detect if a directory is empty. auto gcs_file = static_cast(filesystem->plugin_filesystem); - int object_count = 0; - for (auto&& metadata : - gcs_file->gcs_client.ListObjects(bucket, gcs::Prefix(object))) { - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); - return; - } - ++object_count; - // We consider a path is a non-empty directory in two cases: - // - There are more than two objects whose keys start with the name of this - // directory. - // - There is one object whose key contains the name of this directory ( but - // not equal ). - if (object_count > 1 || metadata->name() != object) { - TF_SetStatus(status, TF_FAILED_PRECONDITION, - "Cannot delete a non-empty directory."); - return; - } - } - auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket, object); - TF_SetStatusFromGCSStatus(gcs_status, status); -} - -// TODO(vnvo2409): `DeleteRecursively` needs `GetChildrens` but there will be -// some differents compared to the default implementation. Will be refactored. 
-static void DeleteRecursively(const TF_Filesystem* filesystem, const char* path, - uint64_t* undeleted_files, - uint64_t* undeleted_dirs, TF_Status* status) { - std::string bucket, object; - ParseGCSPath(path, false, &bucket, &object, status); + auto childrens = GetChildrenBounded(gcs_file, path, 2, true, true, status); if (TF_GetCode(status) != TF_OK) return; - - auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto gcs_status = gcs::DeleteByPrefix(gcs_file->gcs_client, bucket, object); - TF_SetStatusFromGCSStatus(gcs_status, status); - if (TF_GetCode(status) != TF_OK) return; - *undeleted_dirs = 0; - *undeleted_files = 0; -} - -// TODO(vnvo2409): `RewriteObjectBlocking` will set `status` to `TF_NOT_FOUND` -// if the object does not exist. In that case, we will have to check if the -// `src` is a directory or not to set the correspondent `status` (i.e -// `TF_NOT_FOUND` if path `src` does not exist, `TF_FAILED_PRECONDITION` if -// path `src` is a directory). -void RenameFile(const TF_Filesystem* filesystem, const char* src, - const char* dst, TF_Status* status) { - std::string bucket_src, object_src; - ParseGCSPath(src, false, &bucket_src, &object_src, status); - if (TF_GetCode(status) != TF_OK) return; - - std::string bucket_dst, object_dst; - ParseGCSPath(dst, false, &bucket_dst, &object_dst, status); - if (TF_GetCode(status) != TF_OK) return; - - auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( - bucket_src, object_src, bucket_dst, object_dst); - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); + if (childrens.size() > 1 || (childrens.size() == 1 && !childrens[0].empty())) + return TF_SetStatus(status, TF_FAILED_PRECONDITION, + "Cannot delete a non-empty directory."); + if (childrens.size() == 1 && childrens[0].empty()) { + // This is the directory marker object. Delete it. + std::string dir = path; + MaybeAppendSlash(&dir); + DeleteFile(filesystem, dir.c_str(), status); return; } - auto gcs_status = gcs_file->gcs_client.DeleteObject(bucket_src, object_src); - TF_SetStatusFromGCSStatus(gcs_status, status); + TF_SetStatus(status, TF_OK, ""); } void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, @@ -787,35 +932,11 @@ void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, auto gcs_file = static_cast(filesystem->plugin_filesystem); auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( - bucket_src, object_src, bucket_dst, object_dst); + bucket_src, object_src, bucket_dst, object_dst, + gcs::Fields("done,rewriteToken")); TF_SetStatusFromGCSStatus(metadata.status(), status); } -// TODO(vnvo2409): This approach can cause a problem when our path is -// `path/to/dir` and there is an object with key `path/to/directory`. Will be -// fixed when refactoring. -void PathExists(const TF_Filesystem* filesystem, const char* path, - TF_Status* status) { - std::string bucket, object; - ParseGCSPath(path, true, &bucket, &object, status); - if (TF_GetCode(status) != TF_OK) return; - - auto gcs_file = static_cast(filesystem->plugin_filesystem); - for (auto&& metadata : - gcs_file->gcs_client.ListObjects(bucket, gcs::Prefix(object))) { - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); - return; - } - // We consider a path exists if there is at least one object whose key - // contains the path. 
- return TF_SetStatus(status, TF_OK, ""); - } - return TF_SetStatus( - status, TF_NOT_FOUND, - absl::StrCat("The path ", path, " does not exist.").c_str()); -} - bool IsDirectory(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { std::string bucket, object; @@ -824,41 +945,133 @@ bool IsDirectory(const TF_Filesystem* filesystem, const char* path, auto gcs_file = static_cast(filesystem->plugin_filesystem); if (object.empty()) { - auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); - TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); - if (TF_GetCode(status) == TF_OK) - return true; - else - return false; + bool result = BucketExists(gcs_file, bucket, status); + if (TF_GetCode(status) != TF_OK) return false; + if (!result) + TF_SetStatus( + status, TF_NOT_FOUND, + absl::StrCat("The specified bucket gs://", bucket, " was not found.") + .c_str()); + return result; } - // We check if there is an object with this key on the GCS server. - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); - if (metadata) { - TF_SetStatus(status, TF_OK, ""); - if (metadata->name().back() == '/') - return true; - else - return false; - } + bool is_folder = FolderExists(gcs_file, path, status); + if (TF_GetCode(status) != TF_OK) return false; + if (is_folder) return true; - // If there is no object with this key on the GCS server. We check if there is - // any object whose key contains that path. - MaybeAppendSlash(&object); - for (auto&& metadata : - gcs_file->gcs_client.ListObjects(bucket, gcs::Prefix(object))) { - if (!metadata) { - TF_SetStatusFromGCSStatus(metadata.status(), status); - return false; - } - TF_SetStatus(status, TF_OK, ""); - return true; + bool is_object = ObjectExists(gcs_file, path, bucket, object, status); + if (TF_GetCode(status) != TF_OK) return false; + if (is_object) { + TF_SetStatus( + status, TF_FAILED_PRECONDITION, + absl::StrCat("The specified path ", path, " is not a directory.") + .c_str()); + return false; } TF_SetStatus(status, TF_NOT_FOUND, absl::StrCat("The path ", path, " does not exist.").c_str()); return false; } +static void RenameObject(const TF_Filesystem* filesystem, + const std::string& src, const std::string& dst, + TF_Status* status) { + TF_VLog(3, "RenameObject: started %s to %s", src.c_str(), dst.c_str()); + std::string bucket_src, object_src; + ParseGCSPath(src, false, &bucket_src, &object_src, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string bucket_dst, object_dst; + ParseGCSPath(dst, false, &bucket_dst, &object_dst, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( + bucket_src, object_src, bucket_dst, object_dst, + gcs::Fields("done,rewriteToken")); + TF_SetStatusFromGCSStatus(metadata.status(), status); + if (TF_GetCode(status) != TF_OK) return; + TF_VLog(3, "RenameObject: finished %s to %s", src.c_str(), dst.c_str()); + + ClearFileCaches(gcs_file, dst); + DeleteFile(filesystem, src.c_str(), status); +} + +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status) { + if (!IsDirectory(filesystem, src, status)) { + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) { + TF_SetStatus(status, TF_OK, ""); + RenameObject(filesystem, src, dst, status); + } + return; + } + + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, src, UINT64_MAX, true, 
true, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string src_dir = src; + std::string dst_dir = dst; + MaybeAppendSlash(&src_dir); + MaybeAppendSlash(&dst_dir); + for (const std::string& children : childrens) { + RenameObject(filesystem, src_dir + children, dst_dir + children, status); + if (TF_GetCode(status) != TF_OK) return; + } + TF_SetStatus(status, TF_OK, ""); +} + +void DeleteRecursively(const TF_Filesystem* filesystem, const char* path, + uint64_t* undeleted_files, uint64_t* undeleted_dirs, + TF_Status* status) { + if (!undeleted_files || !undeleted_dirs) + return TF_SetStatus( + status, TF_INTERNAL, + "'undeleted_files' and 'undeleted_dirs' cannot be nullptr."); + *undeleted_files = 0; + *undeleted_dirs = 0; + if (!IsDirectory(filesystem, path, status)) { + *undeleted_dirs = 1; + return; + } + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, path, UINT64_MAX, true, true, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string dir = path; + MaybeAppendSlash(&dir); + for (const std::string& children : childrens) { + const std::string& full_path = dir + children; + DeleteFile(filesystem, full_path.c_str(), status); + if (TF_GetCode(status) != TF_OK) { + if (IsDirectory(filesystem, full_path.c_str(), status)) + // The object is a directory marker. + (*undeleted_dirs)++; + else + (*undeleted_files)++; + } + } +} + +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status) { + auto gcs_file = static_cast(filesystem->plugin_filesystem); + std::vector childrens = + GetChildrenBounded(gcs_file, path, UINT64_MAX, false, false, status); + if (TF_GetCode(status) != TF_OK) return -1; + + int num_entries = childrens.size(); + *entries = static_cast( + plugin_memory_allocate(num_entries * sizeof((*entries)[0]))); + for (int i = 0; i < num_entries; i++) + (*entries)[i] = strdup(childrens[i].c_str()); + TF_SetStatus(status, TF_OK, ""); + return num_entries; +} + void Stat(const TF_Filesystem* filesystem, const char* path, TF_FileStatistics* stats, TF_Status* status) { std::string bucket, object; @@ -867,7 +1080,8 @@ void Stat(const TF_Filesystem* filesystem, const char* path, auto gcs_file = static_cast(filesystem->plugin_filesystem); if (object.empty()) { - auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); + auto bucket_metadata = + gcs_file->gcs_client.GetBucketMetadata(bucket, gcs::Fields("")); TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); if (TF_GetCode(status) == TF_OK) { stats->is_directory = true; @@ -882,8 +1096,9 @@ void Stat(const TF_Filesystem* filesystem, const char* path, stats->mtime_nsec = 0; return TF_SetStatus(status, TF_OK, ""); } - if (TF_GetCode(status) == TF_OK) { - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) { + auto metadata = gcs_file->gcs_client.GetObjectMetadata( + bucket, object, gcs::Fields("size,timeStorageClassUpdated")); if (metadata) { stats->is_directory = false; stats->length = metadata.value().size(); @@ -896,6 +1111,29 @@ void Stat(const TF_Filesystem* filesystem, const char* path, } } +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + // Only validate the name. 
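  // The size itself comes from the Stat() call below; the returned length is
  // only meaningful when the resulting status is TF_OK, so callers are
  // expected to check the status before using the value.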
+ std::string bucket, object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return -1; + + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + return stat.length; +} + +static char* TranslateName(const TF_Filesystem* filesystem, const char* uri) { + return strdup(uri); +} + +static void FlushCaches(const TF_Filesystem* filesystem) { + auto gcs_file = static_cast(filesystem->plugin_filesystem); + absl::ReaderMutexLock l(&gcs_file->block_cache_lock); + gcs_file->file_block_cache->Flush(); + gcs_file->stat_cache->Clear(); +} + } // namespace tf_gcs_filesystem static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, @@ -912,6 +1150,13 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, plugin_memory_allocate(TF_WRITABLE_FILE_OPS_SIZE)); ops->writable_file_ops->cleanup = tf_writable_file::Cleanup; + ops->read_only_memory_region_ops = static_cast( + plugin_memory_allocate(TF_READ_ONLY_MEMORY_REGION_OPS_SIZE)); + ops->read_only_memory_region_ops->cleanup = + tf_read_only_memory_region::Cleanup; + ops->read_only_memory_region_ops->data = tf_read_only_memory_region::Data; + ops->read_only_memory_region_ops->length = tf_read_only_memory_region::Length; + ops->filesystem_ops = static_cast( plugin_memory_allocate(TF_FILESYSTEM_OPS_SIZE)); ops->filesystem_ops->init = tf_gcs_filesystem::Init; @@ -921,6 +1166,20 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, ops->filesystem_ops->new_writable_file = tf_gcs_filesystem::NewWritableFile; ops->filesystem_ops->new_appendable_file = tf_gcs_filesystem::NewAppendableFile; + ops->filesystem_ops->new_read_only_memory_region_from_file = + tf_gcs_filesystem::NewReadOnlyMemoryRegionFromFile; + ops->filesystem_ops->create_dir = tf_gcs_filesystem::CreateDir; + ops->filesystem_ops->delete_file = tf_gcs_filesystem::DeleteFile; + ops->filesystem_ops->delete_dir = tf_gcs_filesystem::DeleteDir; + ops->filesystem_ops->delete_recursively = + tf_gcs_filesystem::DeleteRecursively; + ops->filesystem_ops->copy_file = tf_gcs_filesystem::CopyFile; + ops->filesystem_ops->path_exists = tf_gcs_filesystem::PathExists; + ops->filesystem_ops->is_directory = tf_gcs_filesystem::IsDirectory; + ops->filesystem_ops->stat = tf_gcs_filesystem::Stat; + ops->filesystem_ops->get_children = tf_gcs_filesystem::GetChildren; + ops->filesystem_ops->translate_name = tf_gcs_filesystem::TranslateName; + ops->filesystem_ops->flush_caches = tf_gcs_filesystem::FlushCaches; } void TF_InitPlugin(TF_FilesystemPluginInfo* info) { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h index 973ce9e9dc2..5612d004d82 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h @@ -87,6 +87,24 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, const char* path, TF_ReadOnlyMemoryRegion* region, TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const 
TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); } // namespace tf_gcs_filesystem #endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc index 82c4e4b8705..e15921335ab 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) +#define EXPECT_TF_OK(x) EXPECT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) static const char* content = "abcdefghijklmnopqrstuvwxyz1234567890"; // We will work with content_view instead of content. @@ -94,6 +95,70 @@ class GCSFilesystemTest : public ::testing::Test { return translated_name; } + std::unique_ptr + GetWriter() { + std::unique_ptr writer( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + writer->plugin_file = nullptr; + return writer; + } + + std::unique_ptr + GetReader() { + std::unique_ptr + reader(new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + reader->plugin_file = nullptr; + return reader; + } + + void WriteString(const std::string& path, const std::string& content) { + auto writer = GetWriter(); + tf_gcs_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Append(writer.get(), content.c_str(), content.length(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Close(writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + } + + std::string ReadAll(const std::string& path) { + auto reader = GetReader(); + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + auto file_size = + tf_gcs_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::string content; + content.resize(file_size); + auto read = tf_random_access_file::Read(reader.get(), 0, file_size, + &content[0], status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read >= 0) content.resize(read); + if (file_size != content.size()) + TF_SetStatus( + status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + " got " + + std::to_string(content.size()) + " bytes") + .c_str()); + return content; + } + protected: TF_Filesystem* filesystem_; TF_Status* status_; @@ -326,6 +391,145 @@ TEST_F(GCSFilesystemTest, ReadOnlyMemoryRegion) { delete region; } +TEST_F(GCSFilesystemTest, PathExists) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("PathExists"); + tf_gcs_filesystem::PathExists(filesystem_, 
path.c_str(), status_); + EXPECT_EQ(TF_NOT_FOUND, TF_GetCode(status_)) << TF_Message(status_); + TF_SetStatus(status_, TF_OK, ""); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(GCSFilesystemTest, GetChildren) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string base = GetURIForPath("GetChildren"); + tf_gcs_filesystem::CreateDir(filesystem_, base.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(base, "TestFile.csv"); + WriteString(file, "test"); + EXPECT_TF_OK(status_); + + const std::string subdir = io::JoinPath(base, "SubDir"); + tf_gcs_filesystem::CreateDir(filesystem_, subdir.c_str(), status_); + EXPECT_TF_OK(status_); + const std::string subfile = io::JoinPath(subdir, "TestSubFile.csv"); + WriteString(subfile, "test"); + EXPECT_TF_OK(status_); + + char** entries; + auto num_entries = tf_gcs_filesystem::GetChildren(filesystem_, base.c_str(), + &entries, status_); + EXPECT_TF_OK(status_); + + std::vector childrens; + for (int i = 0; i < num_entries; ++i) { + childrens.push_back(entries[i]); + } + std::sort(childrens.begin(), childrens.end()); + EXPECT_EQ(std::vector({"SubDir/", "TestFile.csv"}), childrens); +} + +TEST_F(GCSFilesystemTest, DeleteFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("DeleteFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_GetCode(status_), TF_NOT_FOUND); +} + +TEST_F(GCSFilesystemTest, CreateDir) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string dir = GetURIForPath("CreateDir"); + tf_gcs_filesystem::CreateDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_TRUE(stat.is_directory); +} + +TEST_F(GCSFilesystemTest, DeleteDir) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string dir = GetURIForPath("DeleteDir"); + const std::string file = io::JoinPath(dir, "DeleteDirFile.csv"); + WriteString(file, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_EQ(TF_GetCode(status_), TF_FAILED_PRECONDITION); + + TF_SetStatus(status_, TF_OK, ""); + tf_gcs_filesystem::DeleteFile(filesystem_, file.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_EQ(TF_GetCode(status_), TF_NOT_FOUND) << TF_Message(status_); +} + +TEST_F(GCSFilesystemTest, StatFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("StatFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, stat.length); + EXPECT_FALSE(stat.is_directory); +} + +TEST_F(GCSFilesystemTest, RenameFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string src = 
GetURIForPath("RenameFileSrc"); + const std::string dst = GetURIForPath("RenameFileDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_gcs_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test", result); +} + +TEST_F(GCSFilesystemTest, RenameFileOverwrite) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string src = GetURIForPath("RenameFileOverwriteSrc"); + const std::string dst = GetURIForPath("RenameFileOverwriteDst"); + + WriteString(src, "test_old"); + ASSERT_TF_OK(status_); + WriteString(dst, "test_new"); + ASSERT_TF_OK(status_); + + tf_gcs_filesystem::PathExists(filesystem_, dst.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test_old", result); +} + // These tests below are ported from // `//tensorflow/core/platform/cloud:gcs_file_system_test` TEST_F(GCSFilesystemTest, NewRandomAccessFile_NoBlockCache) { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h index 2abfb6f924b..72659a97d42 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "tensorflow/c/env.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" namespace tf_gcs_filesystem { @@ -65,8 +66,8 @@ class RamFileBlockCache { pruning_thread_.reset( TF_StartThread(&thread_options, "TF_prune_FBC", PruneThread, this)); } - std::cout << "GCS file block cache is " - << (IsCacheEnabled() ? "enabled" : "disabled") << ".\n"; + TF_VLog(1, "GCS file block cache is %s.\n", + (IsCacheEnabled() ? "enabled" : "disabled")); } ~RamFileBlockCache() { diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD index 51ffd709f3d..bb97587d6d1 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD @@ -1,5 +1,5 @@ # Experimental hadoop filesystem plugin. -load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test") package( licenses = ["notice"], # Apache 2.0 @@ -33,3 +33,38 @@ cc_library( "@com_google_absl//absl/synchronization", ], ) + +# This test is set to manual because it requires downloading the Hadoop +# distribution to run. To run this test: +# 1. Ensure $JAVA_HOME is set to the location of a JDK 8 installation. +# 2. Download the binary Hadoop distribution from: +# http://hadoop.apache.org/releases.html +# 3. Extract the Hadoop distribution and run: +# source libexec/hadoop-config.sh +# 4. Optionally set up HDFS cluster configurations (optionally Kerberos) within +# $HADOOP_HDFS_HOME/etc/hadoop if you want to test against real +# distributed HDFS cluster +# 5. 
bazel test \ +# --test_env=LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server \ +# --test_env=HADOOP_HDFS_HOME=$HADOOP_HDFS_HOME \ +# --test_env=CLASSPATH=$($HADOOP_HDFS_HOME/bin/hadoop classpath --glob) \ +# :hadoop_file_system_test +# To test against the real distributed cluster, add the following option for +# bazel test: +# --test_env=HADOOP_TEST_TMPDIR=hdfs://cluster/test/tmp/dir +tf_cc_test( + name = "hadoop_filesystem_test", + srcs = [ + "hadoop_filesystem_test.cc", + ], + tags = [ + "manual", + "notap", + ], + deps = [ + ":hadoop_filesystem_impl", + "//tensorflow/core/platform:path", + "//tensorflow/core/platform:stacktrace_handler", + "//tensorflow/core/platform:test", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc index e94be3e83a2..b904ba292ab 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc @@ -37,11 +37,17 @@ static void plugin_memory_free(void* ptr) { free(ptr); } void ParseHadoopPath(const std::string& fname, std::string* scheme, std::string* namenode, std::string* path) { size_t scheme_end = fname.find("://") + 2; - *scheme = fname.substr(0, scheme_end + 1); + // We don't want `://` in scheme. + *scheme = fname.substr(0, scheme_end - 2); size_t nn_end = fname.find("/", scheme_end + 1); - if (nn_end == std::string::npos) return; + if (nn_end == std::string::npos) { + *namenode = fname.substr(scheme_end + 1); + *path = ""; + return; + } *namenode = fname.substr(scheme_end + 1, nn_end - scheme_end - 1); - *path = fname.substr(nn_end + 1); + // We keep `/` in path. + *path = fname.substr(nn_end); } void SplitArchiveNameAndPath(std::string* path, std::string* nn, @@ -54,7 +60,7 @@ void SplitArchiveNameAndPath(std::string* path, std::string* nn, } // Case of hadoop archive. Namenode is the path to the archive. std::ostringstream namenodestream; - namenodestream << "har://" << nn + namenodestream << "har://" << *nn << path->substr(0, index_end_archive_name + 4); *nn = namenodestream.str(); path->erase(0, index_end_archive_name + 4); @@ -247,8 +253,8 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, char* dst = buffer; bool eof_retried = false; - int64_t r = 0; - while (TF_GetCode(status) == TF_OK && !eof_retried) { + int64_t read = 0; + while (TF_GetCode(status) == TF_OK && n > 0) { // We lock inside the loop rather than outside so we don't block other // concurrent readers. absl::MutexLock l(&hdfs_file->mu); @@ -257,12 +263,13 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, // of int32. -2 offset can avoid JVM OutOfMemoryError. size_t read_n = (std::min)(n, static_cast(std::numeric_limits::max() - 2)); - r = libhdfs->hdfsPread(fs, handle, static_cast(offset), dst, - static_cast(read_n)); + int64_t r = libhdfs->hdfsPread(fs, handle, static_cast(offset), + dst, static_cast(read_n)); if (r > 0) { dst += r; n -= r; offset += r; + read += r; } else if (!eof_retried && r == 0) { // Always reopen the file upon reaching EOF to see if there's more data. 
// If writers are streaming contents while others are concurrently @@ -274,11 +281,13 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, TF_SetStatusFromIOError(status, errno, path); return -1; } - handle = libhdfs->hdfsOpenFile(fs, hdfs_path, O_RDONLY, 0, 0, 0); - if (handle == nullptr) { + hdfs_file->handle = + libhdfs->hdfsOpenFile(fs, hdfs_path, O_RDONLY, 0, 0, 0); + if (hdfs_file->handle == nullptr) { TF_SetStatusFromIOError(status, errno, path); return -1; } + handle = hdfs_file->handle; eof_retried = true; } else if (eof_retried && r == 0) { TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); @@ -288,7 +297,7 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, TF_SetStatusFromIOError(status, errno, path); } } - return r; + return read; } } // namespace tf_random_access_file @@ -308,7 +317,7 @@ typedef struct HDFSFile { handle(handle) {} } HDFSFile; -static void Cleanup(TF_WritableFile* file) { +void Cleanup(TF_WritableFile* file) { auto hdfs_file = static_cast(file->plugin_file); hdfs_file->libhdfs->hdfsCloseFile(hdfs_file->fs, hdfs_file->handle); hdfs_file->fs = nullptr; @@ -433,6 +442,23 @@ void NewWritableFile(const TF_Filesystem* filesystem, const char* path, std::string scheme, namenode, hdfs_path; ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + auto handle = libhdfs->hdfsOpenFile(fs, hdfs_path.c_str(), O_WRONLY, 0, 0, 0); + if (handle == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + file->plugin_file = + new tf_writable_file::HDFSFile(hdfs_path, fs, libhdfs, handle); + TF_SetStatus(status, TF_OK, ""); +} + +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + auto handle = libhdfs->hdfsOpenFile(fs, hdfs_path.c_str(), O_WRONLY | O_APPEND, 0, 0, 0); if (handle == nullptr) return TF_SetStatusFromIOError(status, errno, path); @@ -442,6 +468,202 @@ void NewWritableFile(const TF_Filesystem* filesystem, const char* path, TF_SetStatus(status, TF_OK, ""); } +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status) { + // hadoopReadZero() technically supports this call with the following + // caveats: + // - It only works up to 2 GB. We'd have to Stat() the file to ensure that + // it fits. + // - If not on the local filesystem, the entire file will be read, making + // it inefficient for callers that assume typical mmap() behavior. 
+ TF_SetStatus(status, TF_UNIMPLEMENTED, + "HDFS does not support ReadOnlyMemoryRegion"); +} + +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsExists(fs, hdfs_path.c_str()) == 0) + TF_SetStatus(status, TF_OK, ""); + else + TF_SetStatus(status, TF_NOT_FOUND, + (std::string(path) + " not found").c_str()); +} + +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto info = libhdfs->hdfsGetPathInfo(fs, hdfs_path.c_str()); + if (info == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + stats->length = static_cast(info->mSize); + stats->mtime_nsec = static_cast(info->mLastMod) * 1e9; + stats->is_directory = info->mKind == kObjectKindDirectory; + libhdfs->hdfsFreeFileInfo(info, 1); + TF_SetStatus(status, TF_OK, ""); +} + +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return -1; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto info = libhdfs->hdfsGetPathInfo(fs, hdfs_path.c_str()); + if (info == nullptr) { + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + + TF_SetStatus(status, TF_OK, ""); + auto size = static_cast(info->mSize); + libhdfs->hdfsFreeFileInfo(info, 1); + return size; +} + +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsDelete(fs, hdfs_path.c_str(), /*recursive=*/0) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + if (libhdfs->hdfsCreateDirectory(fs, hdfs_path.c_str()) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + // Count the number of entries in the directory, and only delete if it's + // non-empty. 
This is consistent with the interface, but note that there's + // a race condition where a file may be added after this check, in which + // case the directory will still be deleted. + int entries = 0; + auto info = libhdfs->hdfsListDirectory(fs, hdfs_path.c_str(), &entries); + if (info != nullptr) libhdfs->hdfsFreeFileInfo(info, entries); + + // Due to HDFS bug HDFS-8407, we can't distinguish between an error and empty + // folder, especially for Kerberos enable setup, EAGAIN is quite common when + // the call is actually successful. Check again by Stat. + if (info == nullptr && errno != 0) { + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + if (TF_GetCode(status) != TF_OK) return; + } + + if (entries > 0) + return TF_SetStatus(status, TF_FAILED_PRECONDITION, + "Cannot delete a non-empty directory."); + + if (libhdfs->hdfsDelete(fs, hdfs_path.c_str(), /*recursive=*/1) != 0) + TF_SetStatusFromIOError(status, errno, path); + else + TF_SetStatus(status, TF_OK, ""); +} + +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, src, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path_src, hdfs_path_dst; + ParseHadoopPath(src, &scheme, &namenode, &hdfs_path_src); + ParseHadoopPath(dst, &scheme, &namenode, &hdfs_path_dst); + + if (libhdfs->hdfsExists(fs, hdfs_path_dst.c_str()) == 0 && + libhdfs->hdfsDelete(fs, hdfs_path_dst.c_str(), /*recursive=*/0) != 0) + return TF_SetStatusFromIOError(status, errno, dst); + + if (libhdfs->hdfsRename(fs, hdfs_path_src.c_str(), hdfs_path_dst.c_str()) != + 0) + TF_SetStatusFromIOError(status, errno, src); + else + TF_SetStatus(status, TF_OK, ""); +} + +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status) { + auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto fs = Connect(libhdfs, path, status); + if (TF_GetCode(status) != TF_OK) return -1; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + // hdfsListDirectory returns nullptr if the directory is empty. Do a separate + // check to verify the directory exists first. + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + if (TF_GetCode(status) != TF_OK) return -1; + + int num_entries = 0; + auto info = libhdfs->hdfsListDirectory(fs, hdfs_path.c_str(), &num_entries); + if (info == nullptr) { + if (stat.is_directory) { + // Assume it's an empty directory. 
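      // (A nullptr listing is also how libhdfs reports an empty directory, so
      // it cannot be told apart from a failure by the return value alone; the
      // Stat() call above has already confirmed the path is a directory, which
      // is why an empty result is returned here instead of an error.)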
+ TF_SetStatus(status, TF_OK, ""); + return 0; + } + TF_SetStatusFromIOError(status, errno, path); + return -1; + } + *entries = static_cast( + plugin_memory_allocate(num_entries * sizeof((*entries)[0]))); + auto BaseName = [](const std::string& name) { + return name.substr(name.find_last_of('/') + 1); + }; + for (int i = 0; i < num_entries; i++) { + (*entries)[i] = strdup(BaseName(info[i].mName).c_str()); + } + libhdfs->hdfsFreeFileInfo(info, num_entries); + TF_SetStatus(status, TF_OK, ""); + return num_entries; +} + // TODO(vnvo2409): Implement later } // namespace tf_hadoop_filesystem diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h index 850cefe0231..8de66c05bac 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h @@ -15,7 +15,62 @@ limitations under the License. #ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ #define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ +#include + #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/tf_status.h" +void ParseHadoopPath(const std::string& fname, std::string* scheme, + std::string* namenode, std::string* path); +void SplitArchiveNameAndPath(std::string* path, std::string* nn, + TF_Status* status); +class LibHDFS; + +namespace tf_random_access_file { +void Cleanup(TF_RandomAccessFile* file); +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status); +} // namespace tf_random_access_file + +namespace tf_writable_file { +void Cleanup(TF_WritableFile* file); +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status); +int64_t Tell(const TF_WritableFile* file, TF_Status* status); +void Sync(const TF_WritableFile* file, TF_Status* status); +void Flush(const TF_WritableFile* file, TF_Status* status); +void Close(const TF_WritableFile* file, TF_Status* status); +} // namespace tf_writable_file + +namespace tf_hadoop_filesystem { +void Init(TF_Filesystem* filesystem, TF_Status* status); +void Cleanup(TF_Filesystem* filesystem); +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status); +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + 
char*** entries, TF_Status* status); +} // namespace tf_hadoop_filesystem + #endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc new file mode 100644 index 00000000000..77079fb5325 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc @@ -0,0 +1,418 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h" + +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/stacktrace_handler.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/hadoop/hdfs.h" + +#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) +#define EXPECT_TF_OK(x) EXPECT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) + +namespace tensorflow { +namespace { + +class HadoopFileSystemTest : public ::testing::Test { + public: + void SetUp() override { + status_ = TF_NewStatus(); + filesystem_ = new TF_Filesystem; + tf_hadoop_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. 
" + << TF_Message(status_); + } + void TearDown() override { + TF_DeleteStatus(status_); + tf_hadoop_filesystem::Cleanup(filesystem_); + delete filesystem_; + } + + std::string TmpDir(const std::string& path) { + char* test_dir = getenv("HADOOP_TEST_TMPDIR"); + if (test_dir != nullptr) { + return io::JoinPath(std::string(test_dir), path); + } else { + return "file://" + io::JoinPath(testing::TmpDir(), path); + } + } + + std::unique_ptr + GetWriter() { + std::unique_ptr writer( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + writer->plugin_file = nullptr; + return writer; + } + + std::unique_ptr + GetReader() { + std::unique_ptr + reader(new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + reader->plugin_file = nullptr; + return reader; + } + + void WriteString(const std::string& path, const std::string& content) { + auto writer = GetWriter(); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), + writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Append(writer.get(), content.c_str(), content.length(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Close(writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + } + + std::string ReadAll(const std::string& path) { + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + auto file_size = + tf_hadoop_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::string content; + content.resize(file_size); + auto read = tf_random_access_file::Read(reader.get(), 0, file_size, + &content[0], status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read >= 0) content.resize(read); + if (file_size != content.size()) + TF_SetStatus( + status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + " got " + + std::to_string(content.size()) + " bytes") + .c_str()); + return content; + } + + protected: + TF_Filesystem* filesystem_; + TF_Status* status_; +}; + +TEST_F(HadoopFileSystemTest, RandomAccessFile) { + const std::string path = TmpDir("RandomAccessFile"); + const std::string content = "abcdefghijklmn"; + + WriteString(path, content); + ASSERT_TF_OK(status_); + + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content.size(), result.size()); + EXPECT_EQ(content, result); + + result.clear(); + result.resize(4); + read = tf_random_access_file::Read(reader.get(), 2, 4, &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, result.size()); + EXPECT_EQ(content.substr(2, 4), result); +} + +TEST_F(HadoopFileSystemTest, WritableFile) { + auto writer = GetWriter(); + const std::string path = TmpDir("WritableFile"); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Append(writer.get(), 
"content1,", strlen("content1,"), + status_); + EXPECT_TF_OK(status_); + auto pos = tf_writable_file::Tell(writer.get(), status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(pos, 9); + + tf_writable_file::Append(writer.get(), "content2", strlen("content2"), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Sync(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); + + auto content = ReadAll(path); + EXPECT_TF_OK(status_); + EXPECT_EQ("content1,content2", content); +} + +TEST_F(HadoopFileSystemTest, PathExists) { + const std::string path = TmpDir("PathExists"); + tf_hadoop_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_NOT_FOUND, TF_GetCode(status_)) << TF_Message(status_); + TF_SetStatus(status_, TF_OK, ""); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_hadoop_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, GetChildren) { + const std::string base = TmpDir("GetChildren"); + tf_hadoop_filesystem::CreateDir(filesystem_, base.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(base, "TestFile.csv"); + WriteString(file, "test"); + EXPECT_TF_OK(status_); + + const std::string subdir = io::JoinPath(base, "SubDir"); + tf_hadoop_filesystem::CreateDir(filesystem_, subdir.c_str(), status_); + EXPECT_TF_OK(status_); + const std::string subfile = io::JoinPath(subdir, "TestSubFile.csv"); + WriteString(subfile, "test"); + EXPECT_TF_OK(status_); + + char** entries; + auto num_entries = tf_hadoop_filesystem::GetChildren( + filesystem_, base.c_str(), &entries, status_); + EXPECT_TF_OK(status_); + + std::vector childrens; + for (int i = 0; i < num_entries; ++i) { + childrens.push_back(entries[i]); + } + std::sort(childrens.begin(), childrens.end()); + EXPECT_EQ(std::vector({"SubDir", "TestFile.csv"}), childrens); +} + +TEST_F(HadoopFileSystemTest, DeleteFile) { + const std::string path = TmpDir("DeleteFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_hadoop_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, GetFileSize) { + const std::string path = TmpDir("GetFileSize"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + auto file_size = + tf_hadoop_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, file_size); +} + +TEST_F(HadoopFileSystemTest, CreateDirStat) { + const std::string path = TmpDir("CreateDirStat"); + tf_hadoop_filesystem::CreateDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_TRUE(stat.is_directory); +} + +TEST_F(HadoopFileSystemTest, DeleteDir) { + const std::string path = TmpDir("DeleteDir"); + tf_hadoop_filesystem::DeleteDir(filesystem_, path.c_str(), status_); + EXPECT_NE(TF_GetCode(status_), TF_OK); + tf_hadoop_filesystem::CreateDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + tf_hadoop_filesystem::DeleteDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_NE(TF_GetCode(status_), TF_OK); +} + +TEST_F(HadoopFileSystemTest, RenameFile) { + const std::string src = 
TmpDir("RenameFileSrc"); + const std::string dst = TmpDir("RenameFileDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_hadoop_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), + status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test", result); +} + +TEST_F(HadoopFileSystemTest, RenameFileOverwrite) { + const std::string src = TmpDir("RenameFileOverwriteSrc"); + const std::string dst = TmpDir("RenameFileOverwriteDst"); + + WriteString(src, "test_old"); + ASSERT_TF_OK(status_); + WriteString(dst, "test_new"); + ASSERT_TF_OK(status_); + + tf_hadoop_filesystem::PathExists(filesystem_, dst.c_str(), status_); + EXPECT_TF_OK(status_); + tf_hadoop_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), + status_); + EXPECT_TF_OK(status_); + + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test_old", result); +} + +TEST_F(HadoopFileSystemTest, StatFile) { + const std::string path = TmpDir("StatFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, stat.length); + EXPECT_FALSE(stat.is_directory); +} + +TEST_F(HadoopFileSystemTest, WriteWhileReading) { + const std::string path = TmpDir("WriteWhileReading"); + // Skip the test if we're not testing on HDFS. Hadoop's local filesystem + // implementation makes no guarantees that writable files are readable while + // being written. + if (path.find_first_of("hdfs://") != 0) GTEST_SKIP(); + + auto writer = GetWriter(); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + + const std::string content1 = "content1"; + tf_writable_file::Append(writer.get(), content1.c_str(), content1.size(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content1.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content1.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content1, result); + + const std::string content2 = "content2"; + tf_writable_file::Append(writer.get(), content2.c_str(), content2.size(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + + result.resize(content2.size()); + read = tf_random_access_file::Read(reader.get(), content1.size(), + content2.size(), &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content2, result); + + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, HarSplit) { + const std::string har_path = + "har://hdfs-root/user/j.doe/my_archive.har/dir0/dir1/file.txt"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + EXPECT_EQ("/user/j.doe/my_archive.har/dir0/dir1/file.txt", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ("har://hdfs-root/user/j.doe/my_archive.har", namenode); + EXPECT_EQ("/dir0/dir1/file.txt", path); +} + +TEST_F(HadoopFileSystemTest, NoHarExtension) { + const std::string har_path = + 
"har://hdfs-root/user/j.doe/my_archive/dir0/dir1/file.txt"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + EXPECT_EQ("/user/j.doe/my_archive/dir0/dir1/file.txt", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_EQ(TF_GetCode(status_), TF_INVALID_ARGUMENT) << TF_Message(status_); +} + +TEST_F(HadoopFileSystemTest, HarRootPath) { + const std::string har_path = "har://hdfs-root/user/j.doe/my_archive.har"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + EXPECT_EQ("/user/j.doe/my_archive.har", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ("har://hdfs-root/user/j.doe/my_archive.har", namenode); + EXPECT_EQ("/", path); +} + +TEST_F(HadoopFileSystemTest, WriteLargeFile) { + if (std::getenv("HADOOP_TEST_LARGE_FILE") != "1") GTEST_SKIP(); + const std::string path = TmpDir("WriteLargeFile"); + const size_t file_size = + static_cast(std::numeric_limits::max()) + 1024; + // Fake a test string. + std::string source(file_size, {}); + for (size_t i = 0; i < file_size; ++i) source[i] = (i % 128); + WriteString(path, source); + ASSERT_TF_OK(status_); + auto result = ReadAll(path); + EXPECT_TF_OK(status_); + EXPECT_EQ(source, result); +} +// NewAppendableFile() is not testable. Local filesystem maps to +// ChecksumFileSystem in Hadoop, where appending is an unsupported operation. + +} // namespace +} // namespace tensorflow + +GTEST_API_ int main(int argc, char** argv) { + tensorflow::testing::InstallStacktraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/BUILD b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD index 56bd3b4a75c..a2108d06cbb 100644 --- a/tensorflow/c/experimental/filesystem/plugins/s3/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD @@ -26,6 +26,8 @@ cc_library( }), deps = [ ":aws_crypto", + ":aws_logging", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@aws", @@ -45,6 +47,18 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "aws_logging", + srcs = ["aws_logging.cc"], + hdrs = ["aws_logging.h"], + deps = [ + "//tensorflow/c:logging", + "@aws", + "@com_google_absl//absl/synchronization", + ], + alwayslink = 1, +) + tf_cc_test( name = "s3_filesystem_test", srcs = [ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc new file mode 100644 index 00000000000..353b733fd25 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc @@ -0,0 +1,159 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h" + +#include +#include +#include + +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/c/logging.h" + +static constexpr char kAWSLoggingTag[] = "AWSLogging"; + +static const std::map + log_levels_string_to_aws = { + {"off", Aws::Utils::Logging::LogLevel::Off}, + {"fatal", Aws::Utils::Logging::LogLevel::Fatal}, + {"error", Aws::Utils::Logging::LogLevel::Error}, + {"warn", Aws::Utils::Logging::LogLevel::Warn}, + {"info", Aws::Utils::Logging::LogLevel::Info}, + {"debug", Aws::Utils::Logging::LogLevel::Debug}, + {"trace", Aws::Utils::Logging::LogLevel::Trace}}; + +static const std::map + log_levels_tf_to_aws = {{0, Aws::Utils::Logging::LogLevel::Info}, + {1, Aws::Utils::Logging::LogLevel::Warn}, + {2, Aws::Utils::Logging::LogLevel::Error}, + {3, Aws::Utils::Logging::LogLevel::Fatal}}; + +namespace tf_s3_filesystem { + +AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level) + : log_level_(log_level) {} + +void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level, + const std::string& message) { + if (message == "Initializing Curl library") return; + switch (log_level) { + case Aws::Utils::Logging::LogLevel::Info: + TF_Log(TF_INFO, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Warn: + TF_Log(TF_WARNING, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Error: + TF_Log(TF_ERROR, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Fatal: + TF_Log(TF_FATAL, message.c_str()); + break; + default: + // this will match for DEBUG, TRACE + TF_Log(TF_INFO, message.c_str()); + break; + } +} + +void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) { + char buffer[256]; + va_list args; + va_start(args, format); + vsnprintf(buffer, 256, format, args); + va_end(args); + LogMessage(log_level, buffer); +} + +void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level, + const char* tag, + const Aws::OStringStream& message_stream) { + LogMessage(log_level, message_stream.rdbuf()->str().c_str()); +} + +void AWSLogSystem::Flush() { return; } + +static Aws::Utils::Logging::LogLevel TfLogLevelToAwsLogLevel(int level) { + // Converts TF Log Levels INFO, WARNING, ERROR and FATAL to the AWS enum + // values for the levels + if (log_levels_tf_to_aws.find(level) != log_levels_tf_to_aws.end()) { + return log_levels_tf_to_aws.at(level); + } else { + // default to fatal + return Aws::Utils::Logging::LogLevel::Fatal; + } +} + +static Aws::Utils::Logging::LogLevel ParseAwsLogLevelFromEnv() { + // defaults to FATAL log level for the AWS SDK + // this is because many normal tensorflow operations are logged as errors in + // the AWS SDK such as checking if a file exists can log an error in AWS SDK + // if the file does not actually exist. Another such case is when reading a + // file till the end, TensorFlow expects to see an InvalidRange exception at + // the end, but this would be an error in the AWS SDK. This confuses users, + // hence the default setting. 
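  // For example, with the tables above: AWS_LOG_LEVEL=warn selects Warn,
  // AWS_LOG_LEVEL=2 is read as a TensorFlow log level and maps to Error, and
  // an unset or unrecognized value keeps the Fatal default.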
+ Aws::Utils::Logging::LogLevel log_level = + Aws::Utils::Logging::LogLevel::Fatal; + + const char* aws_env_var_val = getenv("AWS_LOG_LEVEL"); + if (aws_env_var_val != nullptr) { + std::string maybe_integer_str(aws_env_var_val, strlen(aws_env_var_val)); + std::istringstream ss(maybe_integer_str); + int level; + ss >> level; + if (ss.fail()) { + // wasn't a number + // expecting a string + std::string level_str = maybe_integer_str; + if (log_levels_string_to_aws.find(level_str) != + log_levels_string_to_aws.end()) { + log_level = log_levels_string_to_aws.at(level_str); + } + } else { + // backwards compatibility + // valid number, but this number follows the standard TensorFlow log + // levels need to convert this to AWS SDK logging level number + log_level = TfLogLevelToAwsLogLevel(level); + } + } + return log_level; +} + +static bool initialized = false; +ABSL_CONST_INIT static absl::Mutex s3_logging_mutex(absl::kConstInit); +void AWSLogSystem::InitializeAWSLogging() { + absl::MutexLock l(&s3_logging_mutex); + if (!initialized) { + Aws::Utils::Logging::InitializeAWSLogging(Aws::MakeShared( + kAWSLoggingTag, ParseAwsLogLevelFromEnv())); + initialized = true; + return; + } +} + +void AWSLogSystem::ShutdownAWSLogging() { + absl::MutexLock l(&s3_logging_mutex); + if (initialized) { + Aws::Utils::Logging::ShutdownAWSLogging(); + initialized = false; + return; + } +} + +} // namespace tf_s3_filesystem diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h new file mode 100644 index 00000000000..afecd7e5e62 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ + +#include +#include + +#include +#include + +namespace tf_s3_filesystem { + +class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface { + public: + static void InitializeAWSLogging(); + static void ShutdownAWSLogging(); + + explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level); + virtual ~AWSLogSystem() = default; + + // Gets the currently configured log level. + Aws::Utils::Logging::LogLevel GetLogLevel(void) const override { + return log_level_; + } + + // Set a new log level. This has the immediate effect of changing the log. + void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) { + log_level_.store(log_level); + } + + // Does a printf style output to ProcessFormattedStatement. Don't use this, + // it's unsafe. See LogStream. + void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) override; + + // Writes the stream to ProcessFormattedStatement. 
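+  // Like Log(), this funnels into LogMessage(), which maps AWS SDK levels to
+  // TF_Log severities (Debug and Trace are emitted as INFO).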
+ void LogStream(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const Aws::OStringStream& messageStream) override; + + // Flushes the buffered messages if the logger supports buffering + void Flush() override; + + private: + void LogMessage(Aws::Utils::Logging::LogLevel log_level, + const std::string& message); + std::atomic log_level_; +}; + +} // namespace tf_s3_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc index 7e1b36f2dcc..9ff07633f2a 100644 --- a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc @@ -38,6 +38,8 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h" +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for S3 environments. @@ -186,6 +188,8 @@ static void GetS3Client(tf_s3_filesystem::S3File* s3_file) { absl::MutexLock l(&s3_file->initialization_lock); if (s3_file->s3_client.get() == nullptr) { + tf_s3_filesystem::AWSLogSystem::InitializeAWSLogging(); + Aws::SDKOptions options; options.cryptoOptions.sha256Factory_create_fn = []() { return Aws::MakeShared( @@ -250,6 +254,7 @@ static void ShutdownClient(Aws::S3::S3Client* s3_client) { delete s3_client; Aws::SDKOptions options; Aws::ShutdownAPI(options); + tf_s3_filesystem::AWSLogSystem::ShutdownAWSLogging(); } } @@ -281,6 +286,7 @@ void Cleanup(TF_RandomAccessFile* file) { static int64_t ReadS3Client(S3File* s3_file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { + TF_VLog(3, "ReadFile using S3Client\n"); Aws::S3::Model::GetObjectRequest get_object_request; get_object_request.WithBucket(s3_file->bucket).WithKey(s3_file->object); Aws::String bytes = @@ -306,12 +312,14 @@ static int64_t ReadS3Client(S3File* s3_file, uint64_t offset, size_t n, static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { + TF_VLog(3, "Using TransferManager\n"); auto create_download_stream = [&]() { return Aws::New( "S3ReadStream", Aws::New( "S3ReadStream", reinterpret_cast(buffer), n)); }; + TF_VLog(3, "Created stream to read with transferManager\n"); auto handle = s3_file->transfer_manager->DownloadFile( s3_file->bucket, s3_file->object, offset, n, create_download_stream); handle->WaitUntilFinished(); @@ -322,6 +330,10 @@ static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE && retries++ < kDownloadRetries) { // Only failed parts will be downloaded again. + TF_VLog( + 1, + "Retrying read of s3://%s/%s after failure. 
Current retry count: %u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), retries); s3_file->transfer_manager->RetryDownload(handle); handle->WaitUntilFinished(); } @@ -341,6 +353,8 @@ static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { auto s3_file = static_cast(file->plugin_file); + TF_VLog(1, "ReadFilefromS3 s3://%s/%s from %u for n: %u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), offset, n); if (s3_file->use_multi_part_download) return ReadS3TransferManager(s3_file, offset, n, buffer, status); else @@ -416,6 +430,8 @@ void Sync(const TF_WritableFile* file, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); return; } + TF_VLog(1, "WriteFileToS3: s3://%s/%s\n", s3_file->bucket.c_str(), + s3_file->object.c_str()); auto position = static_cast(s3_file->outfile->tellp()); auto handle = s3_file->transfer_manager->UploadFile( s3_file->outfile, s3_file->bucket, s3_file->object, @@ -426,6 +442,10 @@ void Sync(const TF_WritableFile* file, TF_Status* status) { while (handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED && retries++ < kUploadRetries) { // if multipart upload was used, only the failed parts will be re-sent + TF_VLog(1, + "Retrying upload of s3://%s/%s after failure. Current retry count: " + "%u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), retries); s3_file->transfer_manager->RetryUpload(s3_file->outfile, handle); handle->WaitUntilFinished(); } @@ -613,6 +633,7 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, void Stat(const TF_Filesystem* filesystem, const char* path, TF_FileStatistics* stats, TF_Status* status) { + TF_VLog(1, "Stat on path: %s\n", path); Aws::String bucket, object; ParseS3Path(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -737,6 +758,8 @@ static void SimpleCopyFile(const Aws::String& source, const Aws::String& bucket_dst, const Aws::String& object_dst, S3File* s3_file, TF_Status* status) { + TF_VLog(1, "SimpleCopyFile from %s to %s/%s\n", bucket_dst.c_str(), + object_dst.c_str()); Aws::S3::Model::CopyObjectRequest copy_object_request; copy_object_request.WithCopySource(source) .WithBucket(bucket_dst) @@ -801,6 +824,8 @@ static void MultiPartCopy(const Aws::String& source, const Aws::String& object_dst, const size_t num_parts, const uint64_t file_size, S3File* s3_file, TF_Status* status) { + TF_VLog(1, "MultiPartCopy from %s to %s/%s\n", bucket_dst.c_str(), + object_dst.c_str()); Aws::S3::Model::CreateMultipartUploadRequest create_multipart_upload_request; create_multipart_upload_request.WithBucket(bucket_dst).WithKey(object_dst); @@ -827,6 +852,8 @@ static void MultiPartCopy(const Aws::String& source, auto chunk_size = s3_file->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD]; + TF_VLog(1, "Copying from %s in %u parts of size %u each\n", source.c_str(), + num_parts, chunk_size); size_t retries = 0; while (retries++ < 3) { // Queue up parts. @@ -891,6 +918,9 @@ static void MultiPartCopy(const Aws::String& source, status); } else { // Retry. 
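+        // The part is counted as unfinished again so the enclosing retry
+        // loop (at most 3 passes) re-attempts it.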
+ TF_Log(TF_ERROR, + "Retrying failed copy of part %u due to an error with S3\n", + part_number); num_finished_parts--; } } @@ -967,6 +997,7 @@ void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, void DeleteFile(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "DeleteFile: %s\n", path); Aws::String bucket, object; ParseS3Path(path, false, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -985,6 +1016,7 @@ void DeleteFile(const TF_Filesystem* filesystem, const char* path, void CreateDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "CreateDir: %s\n", path); Aws::String bucket, object; ParseS3Path(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -1026,6 +1058,7 @@ void CreateDir(const TF_Filesystem* filesystem, const char* path, void DeleteDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "DeleteDir: %s\n", path); Aws::String bucket, object; ParseS3Path(path, false, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -1060,6 +1093,7 @@ void DeleteDir(const TF_Filesystem* filesystem, const char* path, void RenameFile(const TF_Filesystem* filesystem, const char* src, const char* dst, TF_Status* status) { + TF_VLog(1, "RenameFile from: %s to %s\n", src, dst); Aws::String bucket_src, object_src; ParseS3Path(src, false, &bucket_src, &object_src, status); if (TF_GetCode(status) != TF_OK) return; @@ -1120,6 +1154,7 @@ void RenameFile(const TF_Filesystem* filesystem, const char* src, int GetChildren(const TF_Filesystem* filesystem, const char* path, char*** entries, TF_Status* status) { + TF_VLog(1, "GetChildren for path: %s\n", path); Aws::String bucket, prefix; ParseS3Path(path, true, &bucket, &prefix, status); if (TF_GetCode(status) != TF_OK) return -1; diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD index e3acdf7e2c3..5386c0cf3f7 100644 --- a/tensorflow/c/experimental/gradients/BUILD +++ b/tensorflow/c/experimental/gradients/BUILD @@ -3,6 +3,24 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "array_grad", + srcs = ["array_grad.cc"], + hdrs = [ + "array_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/c/eager:gradients_internal", + "//tensorflow/core/lib/llvm_rtti", + ], +) + cc_library( name = "math_grad", srcs = ["math_grad.cc"], @@ -13,11 +31,63 @@ cc_library( "//tensorflow:internal", ], deps = [ - "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", - "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/c/eager:gradients", + "//tensorflow/c/eager:gradients_internal", "//tensorflow/c/experimental/ops:array_ops", - "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + ], +) + +cc_library( + name = "nn_grad", + srcs = ["nn_grad.cc"], + hdrs = [ + "nn_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:gradients_internal", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/ops:array_ops", + 
"//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "gradients", + hdrs = [ + "array_grad.h", + "math_grad.h", + "nn_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_grad", + ":math_grad", + ":nn_grad", + ], +) + +filegroup( + name = "pywrap_required_hdrs", + srcs = [ + "array_grad.h", + "math_grad.h", + "nn_grad.h", + ], + visibility = [ + "//tensorflow/core:__pkg__", + "//tensorflow/python:__pkg__", ], ) diff --git a/tensorflow/c/experimental/gradients/array_grad.cc b/tensorflow/c/experimental/gradients/array_grad.cc new file mode 100644 index 00000000000..069209a4b6b --- /dev/null +++ b/tensorflow/c/experimental/gradients/array_grad.cc @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/gradients/array_grad.h" + +namespace tensorflow { +namespace gradients { +namespace { +using std::vector; +class IdentityNGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + grad_outputs->resize(grad_inputs.size(), nullptr); + for (int i = 0; i < grad_inputs.size(); i++) { + auto grad_input = grad_inputs[i]; + // TODO(srbs): Should we add a copy contructor to AbstractTensorHandle + // that takes care of this similar to `Tensor`? + if (grad_input) { + grad_input->Ref(); + } + (*grad_outputs)[i] = grad_input; + } + return Status::OK(); + } + ~IdentityNGradientFunction() override {} +}; +} // namespace + +BackwardFunction* IdentityNRegisterer(const ForwardOperation& op) { + auto gradient_function = new IdentityNGradientFunction; + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/array_grad.h b/tensorflow/c/experimental/gradients/array_grad.h new file mode 100644 index 00000000000..edeeb5fcb4a --- /dev/null +++ b/tensorflow/c/experimental/gradients/array_grad.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +BackwardFunction* IdentityNRegisterer(const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_ diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc index 47bd8cce23d..c2aa9caf814 100644 --- a/tensorflow/c/experimental/gradients/math_grad.cc +++ b/tensorflow/c/experimental/gradients/math_grad.cc @@ -14,9 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/gradients.h" #include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" -using tensorflow::ops::Identity; +using std::vector; +using tensorflow::ops::Conj; +using tensorflow::ops::MatMul; +using tensorflow::ops::Mul; namespace tensorflow { namespace gradients { @@ -24,30 +31,184 @@ namespace { class AddGradientFunction : public GradientFunction { public: - Status Compute(Context* ctx, - absl::Span grad_inputs, - std::vector* grad_outputs) override { + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { grad_outputs->resize(2); - std::vector identity_outputs(1); - // TODO(b/145674566): Handle name unification in tracing code. // TODO(b/161805092): Support broadcasting. 
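+    // For z = x + y the local gradient w.r.t. each input is 1, so the
+    // incoming gradient is forwarded to both outputs. Each handle is Ref()'d
+    // because the caller takes ownership of the returned gradients.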
- TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), - "Identity0")); - (*grad_outputs)[0] = identity_outputs[0]; - TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), - "Identity1")); - (*grad_outputs)[1] = identity_outputs[0]; + + DCHECK(grad_inputs[0]); + (*grad_outputs)[0] = grad_inputs[0]; + (*grad_outputs)[1] = grad_inputs[0]; + + (*grad_outputs)[0]->Ref(); + (*grad_outputs)[1]->Ref(); return Status::OK(); } ~AddGradientFunction() override {} }; +class ExpGradientFunction : public GradientFunction { + public: + explicit ExpGradientFunction(AbstractTensorHandle* exp) : exp_(exp) { + exp->Ref(); + } + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + vector conj_outputs(1); + std::string name = "Conj_Exp_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {exp_.get()}, + absl::MakeSpan(conj_outputs), name.c_str())); + AbstractTensorHandlePtr conj_output_releaser(conj_outputs[0]); + grad_outputs->resize(1); + + name = "Mul_Exp_Grad"; + TF_RETURN_IF_ERROR(Mul(ctx->ctx, {conj_outputs[0], grad_inputs[0]}, + absl::MakeSpan(*grad_outputs), name.c_str())); + return Status::OK(); + } + ~ExpGradientFunction() override {} + + private: + AbstractTensorHandlePtr exp_; +}; + +class MatMulGradientFunction : public GradientFunction { + public: + explicit MatMulGradientFunction(vector f_inputs, + AttrBuilder f_attrs) + : forward_inputs(f_inputs), forward_attrs(f_attrs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a matmul op A*B, the gradients are: + * + * dA = U * B.T + * dB = A.T * U + * + * where A.T means `transpose(A)` + */ + AbstractTensorHandle* upstream_grad = grad_inputs[0]; + grad_outputs->resize(2); + + // Get transpose attrs + bool t_a; + TF_RETURN_IF_ERROR(forward_attrs.Get("transpose_a", &t_a)); + + bool t_b; + TF_RETURN_IF_ERROR(forward_attrs.Get("transpose_b", &t_b)); + + // Conj each input + vector conj_outputs(1); + std::string name = "Conj_A_MatMul_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {forward_inputs[0]}, + absl::MakeSpan(conj_outputs), name.c_str())); + + AbstractTensorHandle* A = conj_outputs[0]; + + name = "Conj_B_MatMul_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {forward_inputs[1]}, + absl::MakeSpan(conj_outputs), name.c_str())); + + AbstractTensorHandle* B = conj_outputs[0]; + + // Calc Grad + vector matmul_A_outputs(1); + vector matmul_B_outputs(1); + std::string name_grad_A = "MatMul_Grad_A"; + std::string name_grad_B = "MatMul_Grad_B"; + if (!t_a && !t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, B}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {A, upstream_grad}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ false)); + } else if (!t_a && t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, B}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ false)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, A}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ false)); + + } else if (t_a && !t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {B, upstream_grad}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ 
false, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {A, upstream_grad}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ false)); + } else { // t_a && t_b + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {B, upstream_grad}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, A}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ true)); + } + + // Gradient for A + (*grad_outputs)[0] = matmul_A_outputs[0]; + + // Gradient for B + (*grad_outputs)[1] = matmul_B_outputs[0]; + return Status::OK(); + } + ~MatMulGradientFunction() override {} + + private: + vector forward_inputs; + AttrBuilder forward_attrs; +}; + } // namespace -GradientFunction* AddRegisterer(const ForwardOperation& op) { - return new AddGradientFunction; +BackwardFunction* AddRegisterer(const ForwardOperation& op) { + auto gradient_function = new AddGradientFunction; + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); } + +BackwardFunction* ExpRegisterer(const ForwardOperation& op) { + auto gradient_function = new ExpGradientFunction(op.outputs[0]); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* MatMulRegisterer(const ForwardOperation& op) { + auto gradient_function = new MatMulGradientFunction(op.inputs, op.attrs); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h index 473253f9b27..205419e1201 100644 --- a/tensorflow/c/experimental/gradients/math_grad.h +++ b/tensorflow/c/experimental/gradients/math_grad.h @@ -19,8 +19,10 @@ limitations under the License. namespace tensorflow { namespace gradients { -GradientFunction* AddRegisterer(const ForwardOperation& op); +BackwardFunction* AddRegisterer(const ForwardOperation& op); +BackwardFunction* ExpRegisterer(const ForwardOperation& op); +BackwardFunction* MatMulRegisterer(const ForwardOperation& op); } // namespace gradients } // namespace tensorflow -#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ \ No newline at end of file diff --git a/tensorflow/c/experimental/gradients/nn_grad.cc b/tensorflow/c/experimental/gradients/nn_grad.cc new file mode 100644 index 00000000000..64532c8ffc0 --- /dev/null +++ b/tensorflow/c/experimental/gradients/nn_grad.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/gradients/nn_grad.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +using std::vector; +using tensorflow::ops::Mul; +using tensorflow::ops::ReluGrad; + +namespace tensorflow { +namespace gradients { +namespace { + +class ReluGradientFunction : public GradientFunction { + public: + explicit ReluGradientFunction(vector f_outputs) + : forward_outputs(f_outputs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + AbstractTensorHandle* upstream_grad = grad_inputs[0]; + AbstractTensorHandle* activations = forward_outputs[0]; + grad_outputs->resize(1); + vector relugrad_outputs(1); + + // Calculate Grad + std::string name = "relu_grad"; + + TF_RETURN_IF_ERROR(ReluGrad(ctx->ctx, {upstream_grad, activations}, + absl::MakeSpan(relugrad_outputs), + name.c_str())); + (*grad_outputs)[0] = relugrad_outputs[0]; + + return Status::OK(); + } + ~ReluGradientFunction() override {} + + private: + vector forward_outputs; +}; + +Status BroadcastMul(AbstractContext* ctx, AbstractTensorHandle* vec, + AbstractTensorHandle* mat, + absl::Span outputs) { + if (!isa(ctx)) { + // TODO(b/168850692): Fix this. 
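+    // Tracing (graph) contexts cannot create the int32 constant (-1) handle
+    // needed for the ExpandDims call below, so only immediate (eager)
+    // execution is supported for now.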
+ return errors::Unimplemented( + "BroadcastMul is not supported in tracing mode yet."); + } + auto imm_ctx = dyn_cast(ctx); + AbstractTensorPtr minus_1(imm_ctx->CreateInt32Scalar(-1)); + ImmediateTensorHandlePtr dim(imm_ctx->CreateLocalHandle(minus_1.get())); + vector expand_dims_outputs(1); + TF_RETURN_IF_ERROR(ops::ExpandDims(ctx, {vec, dim.get()}, + absl::MakeSpan(expand_dims_outputs), + "ExpandDims")); + TF_RETURN_IF_ERROR( + ops::Mul(ctx, {expand_dims_outputs[0], mat}, outputs, "Mul")); + expand_dims_outputs[0]->Unref(); + return Status::OK(); +} + +class SparseSoftmaxCrossEntropyWithLogitsGradientFunction + : public GradientFunction { + public: + explicit SparseSoftmaxCrossEntropyWithLogitsGradientFunction( + vector f_outputs) + : forward_outputs(f_outputs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + grad_outputs->resize(2); + + // Grad for Softmax Input + vector mul_outputs(1); + TF_RETURN_IF_ERROR(BroadcastMul( + ctx->ctx, grad_inputs[0], forward_outputs[1], + absl::MakeSpan(mul_outputs))); // upstream_grad * local softmax grad + (*grad_outputs)[0] = mul_outputs[0]; + + // Grad for labels is null + (*grad_outputs)[1] = nullptr; + + return Status::OK(); + } + ~SparseSoftmaxCrossEntropyWithLogitsGradientFunction() override {} + + private: + vector forward_outputs; +}; + +} // namespace + +BackwardFunction* ReluRegisterer(const ForwardOperation& op) { + auto gradient_function = new ReluGradientFunction(op.outputs); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* SparseSoftmaxCrossEntropyWithLogitsRegisterer( + const ForwardOperation& op) { + auto gradient_function = + new SparseSoftmaxCrossEntropyWithLogitsGradientFunction(op.outputs); + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/nn_grad.h b/tensorflow/c/experimental/gradients/nn_grad.h new file mode 100644 index 00000000000..034f20d7325 --- /dev/null +++ b/tensorflow/c/experimental/gradients/nn_grad.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +BackwardFunction* ReluRegisterer(const ForwardOperation& op); +BackwardFunction* SparseSoftmaxCrossEntropyWithLogitsRegisterer( + const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ diff --git a/tensorflow/c/experimental/ops/BUILD b/tensorflow/c/experimental/ops/BUILD index 312709f4332..d2c22e65f80 100644 --- a/tensorflow/c/experimental/ops/BUILD +++ b/tensorflow/c/experimental/ops/BUILD @@ -15,6 +15,7 @@ cc_library( "//tensorflow:internal", ], deps = [ + "//tensorflow/c/eager:abstract_context", "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", @@ -22,3 +23,80 @@ cc_library( "//tensorflow/core/platform:errors", ], ) + +cc_library( + name = "math_ops", + srcs = [ + "math_ops.cc", + ], + hdrs = [ + "math_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_ops", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core:framework", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + ], +) + +cc_library( + name = "nn_ops", + srcs = [ + "nn_ops.cc", + ], + hdrs = [ + "nn_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + ], +) + +cc_library( + name = "ops", + hdrs = [ + "array_ops.h", + "math_ops.h", + "nn_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_ops", + ":math_ops", + ":nn_ops", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/core/lib/llvm_rtti", + ], +) + +filegroup( + name = "pywrap_required_hdrs", + srcs = [ + "array_ops.h", + "math_ops.h", + "nn_ops.h", + ], + visibility = [ + "//tensorflow/core:__pkg__", + "//tensorflow/python:__pkg__", + ], +) diff --git a/tensorflow/c/experimental/ops/array_ops.cc b/tensorflow/c/experimental/ops/array_ops.cc index e38b00088cf..6ea7a0b73f8 100644 --- a/tensorflow/c/experimental/ops/array_ops.cc +++ b/tensorflow/c/experimental/ops/array_ops.cc @@ -14,11 +14,12 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace ops { -// Creates an Identity op. 
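+// Each wrapper below follows the same pattern: create an AbstractOperation,
+// Reset() it to the target op name, set a debug name when running under a
+// tracing context, add the inputs, and Execute() with the expected number of
+// return values. A minimal usage sketch (assuming an eager AbstractContext*
+// `ctx` and an input handle `x`):
+//
+//   std::vector<AbstractTensorHandle*> outputs(1);
+//   TF_RETURN_IF_ERROR(
+//       ops::Identity(ctx, {x}, absl::MakeSpan(outputs), "Identity"));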
+ Status Identity(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { @@ -34,5 +35,51 @@ Status Identity(AbstractContext* ctx, return identity_op->Execute(outputs, &num_retvals); } +Status ZerosLike(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr z_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(z_op->Reset("ZerosLike", /*raw_device_name=*/nullptr)); + if (isa(z_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(z_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(z_op->AddInput(inputs[0])); + int num_retvals = 1; + return z_op->Execute(outputs, &num_retvals); +} + +Status Shape(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr shape_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(shape_op->Reset("Shape", /*raw_device_name=*/nullptr)); + + if (isa(shape_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(shape_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(shape_op->AddInput(inputs[0])); // input + int num_retvals = 1; + TF_RETURN_IF_ERROR(shape_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status ExpandDims(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(op->Reset("ExpandDims", /*raw_device_name=*/nullptr)); + if (isa(op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(op->AddInput(inputs[1])); + int num_retvals = 1; + return op->Execute(outputs, &num_retvals); +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/array_ops.h b/tensorflow/c/experimental/ops/array_ops.h index 8a9db484c2e..a2179d3f137 100644 --- a/tensorflow/c/experimental/ops/array_ops.h +++ b/tensorflow/c/experimental/ops/array_ops.h @@ -15,16 +15,30 @@ limitations under the License. #ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_ #define TENSORFLOW_C_EXPERIMENTAL_OPS_ARRAY_OPS_H_ +#include "tensorflow/c/eager/abstract_context.h" #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" -#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" namespace tensorflow { namespace ops { + Status Identity(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); + +Status ZerosLike(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status Shape(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status ExpandDims(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.cc b/tensorflow/c/experimental/ops/math_ops.cc new file mode 100644 index 00000000000..2c6d01b5e21 --- /dev/null +++ b/tensorflow/c/experimental/ops/math_ops.cc @@ -0,0 +1,166 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/ops/math_ops.h" + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +namespace tensorflow { +namespace ops { +using tensorflow::tracing::TracingOperation; + +Status Mul(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr mul_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(mul_op->Reset("Mul", /*raw_device_name=*/nullptr)); + if (isa(mul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(mul_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[1])); + int num_retvals = 1; + return mul_op->Execute(outputs, &num_retvals); +} + +Status Conj(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + auto dtype = inputs[0]->DataType(); + if (DataTypeIsFloating(BaseType(dtype)) || + DataTypeIsInteger(BaseType(dtype))) { + TF_RETURN_IF_ERROR(Identity(ctx, inputs, outputs, name)); + } else { + return errors::Unimplemented("Conj does not support complex types yet."); + } + return Status::OK(); +} + +Status Add(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr add_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(add_op->Reset("AddV2", /*raw_device_name=*/nullptr)); + + if (isa(add_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(add_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(add_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(add_op->AddInput(inputs[1])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(add_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status Sub(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sub_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sub_op->Reset("Sub", /*raw_device_name=*/nullptr)); + + if (isa(sub_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sub_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(sub_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(sub_op->AddInput(inputs[1])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(sub_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status MatMul(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a = false, bool transpose_b = false) { + AbstractOperationPtr matmul_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(matmul_op->Reset("MatMul", /*raw_device_name=*/nullptr)); + + if (isa(matmul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(matmul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[1])); + + TF_RETURN_IF_ERROR(matmul_op->SetAttrBool("transpose_a", transpose_a)); + 
TF_RETURN_IF_ERROR(matmul_op->SetAttrBool("transpose_b", transpose_b)); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(matmul_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status Neg(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr neg_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(neg_op->Reset("Neg", /*raw_device_name=*/nullptr)); + if (isa(neg_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(neg_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(neg_op->AddInput(inputs[0])); + + int num_retvals = 1; + return neg_op->Execute(outputs, &num_retvals); +} + +Status Sum(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sum_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sum_op->Reset("Sum", /*raw_device_name=*/nullptr)); + + if (isa(sum_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sum_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(sum_op->AddInput(inputs[0])); // input_vals + TF_RETURN_IF_ERROR(sum_op->AddInput(inputs[1])); // reduction_indices + + int num_retvals = 1; + TF_RETURN_IF_ERROR(sum_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status DivNoNan(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr div_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(div_op->Reset("DivNoNan", /*raw_device_name=*/nullptr)); + + if (isa(div_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(div_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(div_op->AddInput(inputs[0])); // x + TF_RETURN_IF_ERROR(div_op->AddInput(inputs[1])); // y + + int num_retvals = 1; + TF_RETURN_IF_ERROR(div_op->Execute( + outputs, &num_retvals)); // z = x / y, (z_i = 0 if y_i = 0) + return Status::OK(); +} + +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.h b/tensorflow/c/experimental/ops/math_ops.h new file mode 100644 index 00000000000..004b8f2bb4d --- /dev/null +++ b/tensorflow/c/experimental/ops/math_ops.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" + +namespace tensorflow { +namespace ops { +Status Mul(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status Conj(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status Add(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status MatMul(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a, bool transpose_b); + +Status Neg(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status Sum(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status Sub(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status DivNoNan(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_MATH_OPS_H_ diff --git a/tensorflow/c/experimental/ops/nn_ops.cc b/tensorflow/c/experimental/ops/nn_ops.cc new file mode 100644 index 00000000000..bcc5586c0ef --- /dev/null +++ b/tensorflow/c/experimental/ops/nn_ops.cc @@ -0,0 +1,85 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/ops/nn_ops.h" + +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace ops { + +// Softmax Loss given scores and labels, used by the SoftMaxLossGradient +Status SparseSoftmaxCrossEntropyWithLogits( + AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sm_loss_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sm_loss_op->Reset("SparseSoftmaxCrossEntropyWithLogits", + /*raw_device_name=*/nullptr)); + + if (isa(sm_loss_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sm_loss_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[0])); // input scores + TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[1])); // labels + + // Outputs will contain: [loss_vals, gradients]. 
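+  // num_retvals is 2 because the op returns both the per-example loss and
+  // the backpropagated gradient with respect to the logits.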
+ int num_retvals = 2; + TF_RETURN_IF_ERROR(sm_loss_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +// Computes Relu gradient given input features +Status ReluGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr relugrad_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR( + relugrad_op->Reset("ReluGrad", /*raw_device_name=*/nullptr)); + + if (isa(relugrad_op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(relugrad_op.get()) + ->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[0])); // upstream grads + TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[1])); // relu inputs + + int num_retvals = 1; + TF_RETURN_IF_ERROR(relugrad_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status Relu(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr relu_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(relu_op->Reset("Relu", /*raw_device_name=*/nullptr)); + + if (isa(relu_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(relu_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(relu_op->AddInput(inputs[0])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(relu_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/nn_ops.h b/tensorflow/c/experimental/ops/nn_ops.h new file mode 100644 index 00000000000..142b74aff0e --- /dev/null +++ b/tensorflow/c/experimental/ops/nn_ops.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ + +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +namespace tensorflow { +namespace ops { + +Status SparseSoftmaxCrossEntropyWithLogits( + AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status ReluGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status Relu(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 8078758328c..2feb7c1b33e 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -44,7 +44,9 @@ cc_library( ], deps = [ ":concrete_function", + ":signature_def_function", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -70,6 +72,26 @@ cc_library( ], ) +cc_library( + name = "signature_def_function", + hdrs = [ + "signature_def_function.h", + ], + deps = [ + ":signature_def_function_metadata", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "signature_def_function_metadata", + hdrs = [ + "signature_def_function_metadata.h", + ], +) + cc_library( name = "test_utils", testonly = True, @@ -115,6 +137,7 @@ cc_library( ":concrete_function", ":saved_model_api", ":saved_model_utils", + ":signature_def_function", "//tensorflow/c:tensor_interface", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_tensor_handle", @@ -206,13 +229,30 @@ tf_cc_test( "//tensorflow/c/experimental/saved_model/core/revived_types:constant", "//tensorflow/core:all_kernels", "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/common_runtime:core_cpu_lib", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:core", + "//tensorflow/core/common_runtime/eager:tensor_handle", + ], +) + +tf_cc_test( + name = "signature_flattening_test", + srcs = [ + "signature_flattening_test.cc", + ], + deps = [ + ":saved_model_utils", + "//tensorflow/c/experimental/saved_model/core:tf_concrete_function_test_protos", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime/eager:core", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index da3a64b91a3..48a20ef7768 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -26,10 +26,14 @@ limitations under the License. 
namespace tensorflow { -// Note that ConcreteFunctions's lifetimes are effectively bound -// to the SavedModel they are loaded from, since they retain pointers -// to the TensorHandles owned by the SavedModel, and the FunctionDef -// of the SavedModel. +// ConcreteFunctions correspond to an instance of a tf.function with a known set +// of inputs (either through get_concrete_function) or an input_signature. +// ConcreteFunction attempts to preserve the user-facing semantics of the +// tf.function python API and can take a limited set of types as arguments +// (to be modeled in tensorflow::Value), not just Tensors. +// SavedModelAPI's ConcreteFunctions' lifetimes are bound to the SavedModel they +// are loaded from, since they retain pointers to the TensorHandles owned by the +// SavedModel, and the FunctionDef of the SavedModel. // Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock // TFRT integration with TF Serving. Do not add more virtual implementations of // this class. Eventually we want to remove this virtual base class indirection @@ -39,8 +43,8 @@ class ConcreteFunction { virtual ~ConcreteFunction() = default; // This method returns the "Call" Op used to execute the function. - virtual Status GetCallOp(absl::Span inputs, - ImmediateOpPtr* out) = 0; + virtual Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const = 0; virtual const FunctionMetadata& GetFunctionMetadata() const = 0; }; diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index 492a58f816d..be9ffff99ff 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -37,10 +37,11 @@ static const char kNoSharingResourceID[] = Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + const char* raw_device_name, ImmediateTensorHandlePtr* handle) { ImmediateOpPtr varhandle_op(ctx->CreateOperation()); - TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", nullptr)); + TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", raw_device_name)); TF_RETURN_IF_ERROR(varhandle_op->SetAttrType("dtype", dtype)); // Note that if shape is unknown rank, shape.dim_sizes() will be empty, and diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h index 13c941a77fe..accad1591da 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -31,6 +31,7 @@ namespace internal { // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + const char* raw_device_name, ImmediateTensorHandlePtr* handle); // Executes an AssignVariableOp using `ctx`, assigning the variable associated diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc index 55a4a32e983..5ce027fe6d8 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc @@ -55,7 +55,7 @@ TEST_F(VariableOpsTest, CreateVariableSuccessful) { // Create a DT_Resource TensorHandle that points 
to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &handle)); + context(), DT_FLOAT, {}, nullptr, &handle)); // The created TensorHandle should be a DT_Resource EXPECT_EQ(handle->DataType(), DT_RESOURCE); } @@ -65,7 +65,7 @@ TEST_F(VariableOpsTest, DestroyVariableSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &handle)); + context(), DT_FLOAT, {}, nullptr, &handle)); // Destroy the variable TF_EXPECT_OK(internal::DestroyResource(context(), handle.get())); @@ -76,7 +76,7 @@ TEST_F(VariableOpsTest, AssignVariableAndReadSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr variable; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &variable)); + context(), DT_FLOAT, {}, nullptr, &variable)); // Create a Scalar float TensorHandle with value 42, and assign it to // the variable. diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD index 2b883618c87..25cac39daa0 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD +++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD @@ -28,6 +28,26 @@ cc_library( ], ) +cc_library( + name = "flat_tensor_function", + srcs = [ + "flat_tensor_function.cc", + ], + hdrs = [ + "flat_tensor_function.h", + ], + deps = [ + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:context", + "@com_google_absl//absl/types:span", + ], +) + cc_library( name = "variable", srcs = [ @@ -68,7 +88,7 @@ cc_library( "tf_concrete_function.h", ], deps = [ - ":tensorhandle_convertible", + ":flat_tensor_function", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_operation", @@ -81,3 +101,26 @@ cc_library( "@com_google_absl//absl/types:span", ], ) + +cc_library( + name = "tf_signature_def_function", + srcs = [ + "tf_signature_def_function.cc", + ], + hdrs = [ + "tf_signature_def_function.h", + ], + deps = [ + ":flat_tensor_function", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:context", + "@com_google_absl//absl/types:span", + ], +) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc new file mode 100644 index 00000000000..ad9f896f43d --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +FlatTensorFunction::FlatTensorFunction( + const std::string& name, + std::vector captures, + ImmediateExecutionContext* ctx) + : name_(name), captures_(std::move(captures)), ctx_(ctx) {} + +FlatTensorFunction::~FlatTensorFunction() { + Status status = ctx_->RemoveFunction(name_); + if (!status.ok()) { + LOG(ERROR) << "Failed to remove functiondef " << name_ << ". " + << status.error_message(); + } +} + +Status FlatTensorFunction::Create( + const FunctionDef* function_def, + std::vector captures, + ImmediateExecutionContext* ctx, std::unique_ptr* out) { + TF_RETURN_IF_ERROR(ctx->AddFunctionDef(*function_def)); + out->reset(new FlatTensorFunction(function_def->signature().name(), + std::move(captures), ctx)); + return Status(); +} + +Status FlatTensorFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + out->reset(ctx_->CreateOperation()); + // In eager mode, TF2 python executes functions by constructing an op with + // the name of the functiondef: + // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L545 + // In graph mode, we create a PartitionedCallOp instead: + // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L573 + + // TODO(bmzhao): After discussing with Allen, we should execute this via a + // PartitionedCallOp for compatibility with "tooling that assumes functions in + // graphs are PartitionedCallOps". + TF_RETURN_IF_ERROR((*out)->Reset(name_.c_str(), nullptr)); + + // Adding the user-provided inputs to the function. + TF_RETURN_IF_ERROR((*out)->AddInputList(inputs)); + + absl::Span captures( + reinterpret_cast(captures_.data()), + captures_.size()); + + // Adding the captures of the function. 
+ TF_RETURN_IF_ERROR((*out)->AddInputList(captures)); + return Status(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h new file mode 100644 index 00000000000..e6bcdec7e3a --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h @@ -0,0 +1,84 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// FlatTensorFunction models a TF2 eager runtime view of a callable function, +// taking + returning flat lists of tensors, including any captures. +// Effectively, it is a thin wrapper around a FunctionDef owned by the +// EagerContext, and any TensorHandle captures associated with the function. The +// MakeCallOp method handles the logic of marshaling captures after the user +// provided inputs automatically. +// Note(bmzhao): This class is mainly intended to house low-level reusable +// function logic between SignatureDefFunction and ConcreteFunction, which +// present higher level interfaces. This type does *not* hold any "function +// metadata". +class FlatTensorFunction { + public: + // Factory for creating a FlatTensorFunction. + // + // Params: + // function_def - The function_def associated with the created + // FlatTensorFunction. FlatTensorFunction will register this + // function_def with `ctx` on creation, and de-register it on + // destruction. function_def must be non-null, but + // otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // FlatTensorFunction. + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFConcreteFunction. + // out - The output FlatTensorFunction. + static Status Create(const FunctionDef* function_def, + std::vector captures, + ImmediateExecutionContext* ctx, + std::unique_ptr* out); + + // This method creates a "Call" Op used to execute the function. 
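For orientation, here is a minimal sketch of how a revived function type is expected to drive FlatTensorFunction end to end. The template arguments are collapsed in the listing above, so the parameter types below (`std::vector<ImmediateTensorHandlePtr>` for captures, `absl::Span<AbstractTensorHandle* const>` for inputs) and the final `Execute` call are stated as assumptions rather than quoted from the patch:

```cpp
Status RunFlatFunction(ImmediateExecutionContext* ctx,
                       const FunctionDef* fdef,
                       std::vector<ImmediateTensorHandlePtr> captures,
                       absl::Span<AbstractTensorHandle* const> inputs,
                       absl::Span<AbstractTensorHandle*> outputs) {
  // Create registers the FunctionDef with `ctx`; the registration is removed
  // again when `func` is destroyed.
  std::unique_ptr<FlatTensorFunction> func;
  TF_RETURN_IF_ERROR(
      FlatTensorFunction::Create(fdef, std::move(captures), ctx, &func));

  // MakeCallOp resets an op named after the FunctionDef and appends the
  // captures after the user-provided `inputs`.
  ImmediateOpPtr call_op;
  TF_RETURN_IF_ERROR(func->MakeCallOp(inputs, &call_op));

  int num_retvals = static_cast<int>(outputs.size());
  return call_op->Execute(outputs, &num_retvals);
}
```

The key property the wrapper guarantees is ordering: callers never interleave arguments themselves, since user inputs always precede the captures appended by MakeCallOp.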
+ Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const; + + ~FlatTensorFunction(); + + private: + FlatTensorFunction(const std::string& name, + std::vector captures, + ImmediateExecutionContext* ctx); + + FlatTensorFunction(const FlatTensorFunction&) = delete; + FlatTensorFunction& operator=(const FlatTensorFunction&) = delete; + + // Name of the FunctionDef corresponding to this TFConcreteFunction + std::string name_; + std::vector captures_; + ImmediateExecutionContext* ctx_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc index f734f9eca66..d9773a4520f 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" -#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/platform/errors.h" @@ -33,32 +33,20 @@ limitations under the License. namespace tensorflow { -TFConcreteFunction::TFConcreteFunction( - const std::string& name, - std::vector captures, - FunctionMetadata metadata, ImmediateExecutionContext* ctx) - : name_(name), - captures_(std::move(captures)), - metadata_(std::move(metadata)), - ctx_(ctx) {} - -TFConcreteFunction::~TFConcreteFunction() { - Status status = ctx_->RemoveFunction(name_); - if (!status.ok()) { - LOG(ERROR) << "Failed to remove functiondef " << name_ << ". 
" - << status.error_message(); - } -} +TFConcreteFunction::TFConcreteFunction(std::unique_ptr func, + FunctionMetadata metadata) + : func_(std::move(func)), metadata_(std::move(metadata)) {} Status TFConcreteFunction::Create( const FunctionDef* function_def, std::vector captures, FunctionMetadata metadata, ImmediateExecutionContext* ctx, std::unique_ptr* out) { - TF_RETURN_IF_ERROR(ctx->AddFunctionDef(*function_def)); - out->reset(new TFConcreteFunction(function_def->signature().name(), - std::move(captures), std::move(metadata), - ctx)); + std::unique_ptr func; + TF_RETURN_IF_ERROR(FlatTensorFunction::Create( + function_def, std::move(captures), ctx, &func)); + + out->reset(new TFConcreteFunction(std::move(func), std::move(metadata))); return Status(); } @@ -66,30 +54,9 @@ const FunctionMetadata& TFConcreteFunction::GetFunctionMetadata() const { return metadata_; } -Status TFConcreteFunction::GetCallOp( - absl::Span inputs, ImmediateOpPtr* out) { - out->reset(ctx_->CreateOperation()); - // In eager mode, TF2 python executes functions by constructing an op with - // the name of the functiondef: - // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L545 - // In graph mode, we create a PartitionedCallOp instead: - // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L573 - - // TODO(bmzhao): After discussing with Allen, we should execute this via a - // PartitionedCallOp for compatibility with "tooling that assumes functions in - // graphs are PartitionedCallOps". - TF_RETURN_IF_ERROR((*out)->Reset(name_.c_str(), nullptr)); - - // Adding the user-provided inputs to the function. - TF_RETURN_IF_ERROR((*out)->AddInputList(inputs)); - - absl::Span captures( - reinterpret_cast(captures_.data()), - captures_.size()); - - // Adding the captures of the function. - TF_RETURN_IF_ERROR((*out)->AddInputList(captures)); - return Status(); +Status TFConcreteFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + return func_->MakeCallOp(inputs, out); } } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h index d38f3546f91..edc26f4d5aa 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" -#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" @@ -58,26 +58,22 @@ class TFConcreteFunction : public ConcreteFunction { std::unique_ptr* out); // This method returns the "Call" Op used to execute the function. 
- Status GetCallOp(absl::Span inputs, - ImmediateOpPtr* out) override; + Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const override; const FunctionMetadata& GetFunctionMetadata() const override; - ~TFConcreteFunction() override; + ~TFConcreteFunction() override = default; private: - TFConcreteFunction(const std::string& name, - std::vector captures, - FunctionMetadata metadata, ImmediateExecutionContext* ctx); + TFConcreteFunction(std::unique_ptr func, + FunctionMetadata metadata); TFConcreteFunction(const TFConcreteFunction&) = delete; TFConcreteFunction& operator=(const TFConcreteFunction&) = delete; - // Name of the FunctionDef corresponding to this TFConcreteFunction - std::string name_; - std::vector captures_; + std::unique_ptr func_; FunctionMetadata metadata_; - ImmediateExecutionContext* ctx_; }; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc new file mode 100644 index 00000000000..ab1745dcd47 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +TFSignatureDefFunction::TFSignatureDefFunction( + std::unique_ptr func, + SignatureDefFunctionMetadata metadata) + : func_(std::move(func)), metadata_(std::move(metadata)) {} + +Status TFSignatureDefFunction::Create( + const FunctionDef* function_def, + std::vector captures, + SignatureDefFunctionMetadata metadata, ImmediateExecutionContext* ctx, + std::unique_ptr* out) { + std::unique_ptr func; + TF_RETURN_IF_ERROR(FlatTensorFunction::Create( + function_def, std::move(captures), ctx, &func)); + + out->reset(new TFSignatureDefFunction(std::move(func), std::move(metadata))); + return Status(); +} + +const SignatureDefFunctionMetadata& +TFSignatureDefFunction::GetFunctionMetadata() const { + return metadata_; +} + +Status TFSignatureDefFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + return func_->MakeCallOp(inputs, out); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h new file mode 100644 index 00000000000..7b564185b8b --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// This is the TF eager runtime implementation of SignatureDefFunction (separate +// from the TFRT implementation). The user-facing API of SignatureDefFunctions +// and their semantic differences from ConcreteFunction are described here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/cc/saved_model/experimental/public/signature_def_function.h#L30-L59 +// Additional implementation notes are available here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/c/experimental/saved_model/core/signature_def_function.h#L31-L48 +class TFSignatureDefFunction : public SignatureDefFunction { + public: + // Factory function for creating a TFSignatureDefFunction. + // + // Params: + // function_def - The function_def associated with the created + // TFSignatureDefFunction. TFSignatureDefFunction will + // register this function_def with `ctx` on creation, and + // de-register it on destruction. function_def must be + // non-null, but otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // TFConcreteFunction. + // metadata - FunctionMetadata associated with this TFSignatureDefFunction. + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFSignatureDefFunction. + // out - The output TFSignatureDefFunction. + static Status Create(const FunctionDef* function_def, + std::vector captures, + SignatureDefFunctionMetadata metadata, + ImmediateExecutionContext* ctx, + std::unique_ptr* out); + + // This method creates a "Call" Op used to execute the function. 
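Since TFSignatureDefFunction shares FlatTensorFunction with TFConcreteFunction, a loader constructs it from the same (FunctionDef, captures) pair; only the metadata type differs. A sketch using a hypothetical helper name (SignatureDefFunctionMetadata is still an empty placeholder in this patch, so it is default-constructed):

```cpp
Status LoadSignatureDefFunction(ImmediateExecutionContext* ctx,
                                const FunctionDef* fdef,
                                std::vector<ImmediateTensorHandlePtr> captures,
                                std::unique_ptr<TFSignatureDefFunction>* out) {
  // Registers `fdef` with `ctx` via FlatTensorFunction::Create underneath.
  return TFSignatureDefFunction::Create(fdef, std::move(captures),
                                        SignatureDefFunctionMetadata(), ctx,
                                        out);
}
```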
+ Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const override; + + const SignatureDefFunctionMetadata& GetFunctionMetadata() const override; + + ~TFSignatureDefFunction() override = default; + + private: + TFSignatureDefFunction(std::unique_ptr func, + SignatureDefFunctionMetadata metadata); + + TFSignatureDefFunction(const TFSignatureDefFunction&) = delete; + TFSignatureDefFunction& operator=(const TFSignatureDefFunction&) = delete; + + std::unique_ptr func_; + SignatureDefFunctionMetadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc index d831a8dd840..a212c25bd28 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc @@ -65,10 +65,11 @@ Status Variable::ReadValue(ImmediateTensorHandlePtr* out) { Status Variable::CreateUninitialized(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, absl::optional name, + const char* raw_device_name, std::unique_ptr* output) { ImmediateTensorHandlePtr handle; TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable( - ctx, dtype, shape, &handle)); + ctx, dtype, shape, raw_device_name, &handle)); output->reset( new Variable(ctx, dtype, shape, std::move(name), std::move(handle))); diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h index 48ea1d08862..13f56fda5f3 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h @@ -37,6 +37,7 @@ class Variable : public TensorHandleConvertible { static Status CreateUninitialized(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, absl::optional name, + const char* raw_device_name, std::unique_ptr* output); // The dtype of the underlying variable. diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_api.h b/tensorflow/c/experimental/saved_model/core/saved_model_api.h index 5d0ed63a765..ff891e13ba4 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/saved_model_api.h @@ -22,6 +22,7 @@ limitations under the License. 
#include #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { @@ -39,11 +40,11 @@ class SavedModelAPI { virtual Status GetFunction(const std::string& function_path, ConcreteFunction** function) = 0; - // Retrieve a function from a SavedModel, using the key of the + // Retrieve a SignatureDefFunction from a SavedModel, using the key of the // SignatureDef map: // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 virtual Status GetSignatureDefFunction(const std::string& signature_def_key, - ConcreteFunction** function) = 0; + SignatureDefFunction** function) = 0; virtual std::vector ListFunctions() = 0; diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc index 2037c4886de..e79fd8d7001 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/protobuf/struct.pb.h" @@ -36,52 +37,8 @@ namespace tensorflow { namespace internal { namespace { -// This returns the size of `tf.nest.flatten(value)`, on values that are -// used in tf.function's input_signatures. -int FlattenedSize(const tensorflow::StructuredValue& value, Status* status) { - // This follows the logic from - // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2775 - switch (value.kind_case()) { - case StructuredValue::kDictValue: { - const DictValue& dict = value.dict_value(); - int size = 0; - for (const auto& field : dict.fields()) { - size += FlattenedSize(field.second, status); - } - return size; - } - case StructuredValue::kTupleValue: { - const TupleValue& tuple = value.tuple_value(); - int size = 0; - for (const StructuredValue& value : tuple.values()) { - size += FlattenedSize(value, status); - } - return size; - } - case StructuredValue::kListValue: { - const ListValue& list = value.list_value(); - int size = 0; - for (const StructuredValue& value : list.values()) { - size += FlattenedSize(value, status); - } - return size; - } - case StructuredValue::kTensorSpecValue: { - return 1; - } - case StructuredValue::kNoneValue: { - // Base case: do nothing. - // This arises, for example, as the top-level object of an output - // signature when there are no return values. 
- return 0; - } - default: { - status->Update(errors::Internal("Unhandled structured value kind ", - value.kind_case())); - return 0; - } - } -} +using StructuredValueDictEntry = + protobuf::MapPair; // Perform some basic sanity checks on SavedConcreteFunction's input and // output signatures with respect to the corresponding FunctionDef's input @@ -111,34 +68,34 @@ Status ValidateSavedFunctionCompatibleWithFunctionDef( // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/python/eager/function.py#L1974-L1979 const std::string& name = function_def->signature().name(); + const StructuredValue& input_signature = saved_concrete_function.canonicalized_input_signature(); - Status status; - int input_signature_size = FlattenedSize(input_signature, &status); - TF_RETURN_IF_ERROR(status); - if (input_signature_size + saved_concrete_function.bound_inputs_size() != + std::vector input_specs; + TF_RETURN_IF_ERROR(FlattenSignature(input_signature, &input_specs)); + if (input_specs.size() + saved_concrete_function.bound_inputs_size() != function_def->signature().input_arg_size()) { return errors::FailedPrecondition( "FunctionDef ", name, " has ", function_def->signature().input_arg_size(), - " inputs, but the SavedConcreteFunction has ", input_signature_size, + " inputs, but the SavedConcreteFunction has ", input_specs.size(), " flattened user inputs and ", saved_concrete_function.bound_inputs_size(), " captured inputs."); } const StructuredValue& output_signature = saved_concrete_function.output_signature(); - int output_signature_size = FlattenedSize(output_signature, &status); - TF_RETURN_IF_ERROR(status); - if (output_signature_size != function_def->signature().output_arg_size()) { + std::vector output_specs; + TF_RETURN_IF_ERROR(FlattenSignature(output_signature, &output_specs)); + if (output_specs.size() != function_def->signature().output_arg_size()) { return errors::FailedPrecondition( "FunctionDef ", name, " has ", function_def->signature().output_arg_size(), - " outputs, but the SavedConcreteFunction has ", output_signature_size, + " outputs, but the SavedConcreteFunction has ", output_specs.size(), " flattened outputs."); } - return status; + return Status(); } } // namespace @@ -165,9 +122,9 @@ Status LoadSavedVariable(ImmediateExecutionContext* ctx, tensorflow::TensorShape shape(variable.shape()); tensorflow::DataType dtype = variable.dtype(); - TF_RETURN_IF_ERROR( - Variable::CreateUninitialized(ctx, dtype, shape, name, output)); - + TF_RETURN_IF_ERROR(Variable::CreateUninitialized( + ctx, dtype, shape, name, + variable.device().empty() ? 
nullptr : variable.device().c_str(), output)); return Status(); } @@ -197,6 +154,62 @@ Status LoadTFConcreteFunction( out); } +Status FlattenSignature(const StructuredValue& signature, + std::vector* flattened_specs) { + // This follows the logic from + // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2775 + switch (signature.kind_case()) { + case StructuredValue::kDictValue: { + // Dictionaries must be sorted in order of keys + const DictValue& dict = signature.dict_value(); + std::vector entries; + entries.reserve(dict.fields_size()); + for (const auto& field : dict.fields()) { + entries.push_back(&field); + } + + std::sort(entries.begin(), entries.end(), + [](const StructuredValueDictEntry* x, + const StructuredValueDictEntry* y) { + return x->first < y->first; + }); + + for (const auto& entry : entries) { + TF_RETURN_IF_ERROR(FlattenSignature(entry->second, flattened_specs)); + } + return Status(); + } + case StructuredValue::kTupleValue: { + const TupleValue& tuple = signature.tuple_value(); + for (const StructuredValue& value : tuple.values()) { + TF_RETURN_IF_ERROR(FlattenSignature(value, flattened_specs)); + } + return Status(); + } + case StructuredValue::kListValue: { + const ListValue& list = signature.list_value(); + for (const StructuredValue& value : list.values()) { + TF_RETURN_IF_ERROR(FlattenSignature(value, flattened_specs)); + } + return Status(); + } + case StructuredValue::kTensorSpecValue: { + flattened_specs->push_back(&signature.tensor_spec_value()); + return Status(); + } + case StructuredValue::kNoneValue: { + // Base case: do nothing. + // This arises, for example, as the top-level object of an output + // signature when there are no return values. + return Status(); + } + default: { + return errors::Internal("Unhandled structured value kind ", + signature.kind_case()); + } + } +} + const SavedObject* FindNodeAtPath(StringPiece path, const SavedObjectGraph& object_graph) { const auto& nodes = object_graph.nodes(); diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h index 57f30afa91b..68bfbe32222 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" namespace tensorflow { namespace internal { @@ -59,10 +60,17 @@ Status LoadTFConcreteFunction( captured_objects, ImmediateExecutionContext* ctx, std::unique_ptr* out); -// Find the SavedObject in `object_graph` at location `path`. `path` must be a -// dot-delimited string of object names relative to the root object. If no -// object is found, returns nullptr. Callers must ensure `object_graph` outlives -// the returned pointer. +// Flattens `signature` into a vector of TensorSpecProto pointers back into +// `signature`. `signature` must outlive flattened_specs. `signature` must also +// be the input or output signature of a SavedConcreteFunction (i.e. "nested +// structures of tensorspecs"). +Status FlattenSignature(const StructuredValue& signature, + std::vector* flattened_specs); + +// Find the SavedObject in `object_graph` at location `path`. 
`path` must be +// a dot-delimited string of object names relative to the root object. If no +// object is found, returns nullptr. Callers must ensure `object_graph` +// outlives the returned pointer. const SavedObject* FindNodeAtPath(StringPiece path, const SavedObjectGraph& object_graph); diff --git a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc index cf58e5e3536..45b0ac00c9b 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -38,9 +39,15 @@ namespace { class SavedVariableLoadingTest : public ::testing::TestWithParam< std::tuple>> { public: - SavedVariableLoadingTest() - : device_mgr_(testing::CreateTestingDeviceMgr()), - ctx_(testing::CreateTestingEagerContext(device_mgr_.get())) {} + SavedVariableLoadingTest() { + SessionOptions options; + options.config.mutable_device_count()->insert({"CPU", 3}); + std::vector> devices; + TF_CHECK_OK(DeviceFactory::AddDevices( + options, "/job:localhost/replica:0/task:0", &devices)); + device_mgr_ = absl::make_unique(std::move(devices)); + ctx_ = testing::CreateTestingEagerContext(device_mgr_.get()); + } EagerContext* context() { return ctx_.get(); } @@ -67,6 +74,39 @@ TEST_P(SavedVariableLoadingTest, LoadSavedVariableSuccessful) { EXPECT_EQ(var->shape(), shape); } +// Verify that a device specified in the SavedVariable is kept. +TEST_P(SavedVariableLoadingTest, LoadSavedVariableWithDevice) { + auto& test_params = GetParam(); + DataType dtype = std::get<0>(test_params); + TensorShape shape(std::get<1>(test_params)); + + SavedVariable saved_variable; + saved_variable.set_dtype(dtype); + saved_variable.set_device("/job:localhost/replica:0/task:0/device:CPU:1"), + shape.AsProto(saved_variable.mutable_shape()); + + std::unique_ptr var; + TF_ASSERT_OK(internal::LoadSavedVariable(context(), saved_variable, &var)); + EXPECT_EQ(down_cast(var->handle())->resource_device()->name(), + "/job:localhost/replica:0/task:0/device:CPU:1"); +} + +// Verify load failure if a non-existing device is specified. +TEST_P(SavedVariableLoadingTest, LoadSavedVariableWithInvalidDevice) { + auto& test_params = GetParam(); + DataType dtype = std::get<0>(test_params); + TensorShape shape(std::get<1>(test_params)); + + SavedVariable saved_variable; + saved_variable.set_dtype(dtype); + saved_variable.set_device("/job:localhost/replica:0/task:0/device:CPU:99"), + shape.AsProto(saved_variable.mutable_shape()); + + std::unique_ptr var; + ASSERT_NE(Status::OK(), + internal::LoadSavedVariable(context(), saved_variable, &var)); +} + // Assigning and reading values should yield // consistent results. 
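The two device tests above exercise the new plumbing through LoadSavedVariable; at the Variable layer the same pinning can be requested directly. A sketch with a hypothetical helper name and an illustrative device string (assuming a multi-CPU context like the fixture above); passing nullptr instead leaves placement to the runtime, which matches the previous behaviour:

```cpp
Status CreatePinnedVariable(ImmediateExecutionContext* ctx,
                            std::unique_ptr<Variable>* var) {
  TensorShape shape;  // scalar
  return Variable::CreateUninitialized(
      ctx, DT_FLOAT, shape, /*name=*/absl::nullopt,
      "/job:localhost/replica:0/task:0/device:CPU:1", var);
}
```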
TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { @@ -79,7 +119,7 @@ TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { Status status; std::unique_ptr var; TF_EXPECT_OK(Variable::CreateUninitialized(context(), dtype, shape, - absl::nullopt, &var)); + absl::nullopt, nullptr, &var)); // Create a TensorHandle ImmediateTensorHandlePtr expected_handle = diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function.h b/tensorflow/c/experimental/saved_model/core/signature_def_function.h new file mode 100644 index 00000000000..0a217f3cc21 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +namespace tensorflow { + +// See tensorflow/cc/experimental/saved_model/public/signature_def_function.h +// for SignatureDefFunction's intended user-facing semantics. +// This class is the "implementation" C++ part of the C++/C/C++ sandwich for +// a SignatureDefFunction. +// Note(bmzhao): Implementation-wise, SignatureDefFunctions are always saved as +// a "BareConcreteFunction", w/o a FunctionSpec, rather than a SavedFunction: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/saved_object_graph.proto#L60 +// Additionally they are guaranteed to be children of the .signatures attribute +// of the root object, where the child object "name" is the signature_def key: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/python/saved_model/signature_serialization.py#L181-L230 +// One of the critical requirements of SignatureDef functions is that their +// inputs and outputs are "named". For example, a `.signatures` function: +// a. Requires users to pass: kwargs of all inputs: +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L119-L126 +// b. Returns a dictionary of named outputs. 
+// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L153-L161 +// Since SignatureDefFunctions do not have FunctionSpecs, but guarantee the +// dictionary of inputs/outputs, we can parse these dictionaries' keys to obtain +// the input/output names of the SignatureDef: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/meta_graph.proto#L318-L321 +class SignatureDefFunction { + public: + virtual ~SignatureDefFunction() = default; + + // Creates a "Call" Op used to execute the function. + virtual Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const = 0; + + virtual const SignatureDefFunctionMetadata& GetFunctionMetadata() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h new file mode 100644 index 00000000000..5a579676d4e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +namespace tensorflow { + +class SignatureDefFunctionMetadata { + // TODO(bmzhao): Fill in with fields as necessary +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc b/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc new file mode 100644 index 00000000000..9ee495f524a --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_flattening_test.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" +#include "tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { +namespace { + +// Validates names, shapes, and dtypes of two tensorspecprotos are equivalent. +bool TensorSpecsAreEqual(const TensorSpecProto& spec, + const std::string& expected_name, + const PartialTensorShape& expected_shape, + DataType expected_dtype) { + return spec.name() == expected_name && + PartialTensorShape(spec.shape()).IsIdenticalTo(expected_shape) && + spec.dtype() == expected_dtype; +} + +// This tests the common case for a tf.function w/o inputs. This ends up +// being serialized as a tuple of an empty tuple + empty dictionary +// (corresponding to the args, kwargs) of the function. +TEST(SignatureFlatteningTest, ZeroArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::ZeroArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 0); +} + +// This tests the common case for a tf.function w/o outputs. This ends up +// being serialized as a "NoneValue". +TEST(SignatureFlatteningTest, ZeroRetOutputSignature) { + std::vector flattened; + StructuredValue value = testing::ZeroReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 0); +} + +TEST(SignatureFlatteningTest, SingleArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::SingleArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 1); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "x", + /* expected_shape = */ {1, 10}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); +} + +TEST(SignatureFlatteningTest, SingleReturnOutputSignature) { + std::vector flattened; + StructuredValue value = testing::SingleReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 1); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); +} + +TEST(SignatureFlatteningTest, ThreeArgInputSignature) { + std::vector flattened; + StructuredValue value = testing::ThreeArgInputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 3); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "x", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[1], + /* expected_name = */ "y", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[1]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[2], + /* expected_name = */ "z", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[2]->DebugString(); +} + +// This test has an exotic outputsignature of tuple of a +// dictionary, tensor 
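The test that follows covers a nested tuple/dict output signature; the key-ordering rule it relies on can also be seen in isolation with a hand-built dict. This extra check is a sketch, not part of the patch (proto accessors follow struct.proto):

```cpp
TEST(SignatureFlatteningTest, DictKeysAreSorted) {
  StructuredValue sig;
  auto* fields = sig.mutable_dict_value()->mutable_fields();
  // Insert out of order; FlattenSignature sorts dict entries by key.
  (*fields)["b"].mutable_tensor_spec_value()->set_name("b");
  (*fields)["a"].mutable_tensor_spec_value()->set_name("a");

  std::vector<const TensorSpecProto*> specs;
  TF_EXPECT_OK(internal::FlattenSignature(sig, &specs));
  ASSERT_EQ(specs.size(), 2);
  EXPECT_EQ(specs[0]->name(), "a");
  EXPECT_EQ(specs[1]->name(), "b");
}
```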
+TEST(SignatureFlatteningTest, ThreeReturnOutputSignature) { + std::vector flattened; + StructuredValue value = testing::ThreeReturnOutputSignature(); + TF_EXPECT_OK(internal::FlattenSignature(value, &flattened)); + EXPECT_EQ(flattened.size(), 3); + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[0], + /* expected_name = */ "0/a", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[0]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[1], + /* expected_name = */ "0/b", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[1]->DebugString(); + + EXPECT_TRUE(TensorSpecsAreEqual(*flattened[2], + /* expected_name = */ "1", + /* expected_shape = */ {1}, + /* expected_dtype = */ DT_FLOAT)) + << "Expected " << flattened[2]->DebugString(); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/test_utils.cc b/tensorflow/c/experimental/saved_model/core/test_utils.cc index b803d129b90..7c11158b17d 100644 --- a/tensorflow/c/experimental/saved_model/core/test_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/test_utils.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -45,7 +45,6 @@ EagerContextPtr CreateTestingEagerContext(DeviceMgr* device_mgr) { return EagerContextPtr(new EagerContext( SessionOptions(), tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, - tensorflow::ContextMirroringPolicy::MIRRORING_NONE, /* async= */ false, /* lazy_copy_function_remote_inputs= */ false, device_mgr, /* device_mgr_owned= */ false, /* rendezvous= */ nullptr, diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc index c22f8d86174..ab7052b52ed 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/core/framework/attr_value.pb.h" @@ -47,6 +48,7 @@ limitations under the License. 
#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/stringpiece.h" @@ -241,8 +243,11 @@ Status RestoreCheckpoint(SavedModelV2Bundle* bundle, // TODO(bmzhao): This requires using the newly added Save/Restore // functions from // https://github.com/tensorflow/tensorflow/commit/df6b21c13c82b5d0981642cfe18f10e60f78ea5c - return errors::Unimplemented( - "Restoring non-variable objects has not been implemented yet. "); + LOG(WARNING) << "Restoring non-variable objects has not been " + "implemented yet. (Kind=" + << bundle->saved_object_graph().nodes(node).kind_case() + << ")"; + return Status::OK(); } Variable* variable = @@ -301,7 +306,7 @@ Status TFSavedModelAPI::GetFunction(const std::string& function_path, } Status TFSavedModelAPI::GetSignatureDefFunction( - const std::string& signature_def_key, ConcreteFunction** function) { + const std::string& signature_def_key, SignatureDefFunction** function) { // TODO(bmzhao): Add support for retrieving a signaturedef function. return errors::Unimplemented( "Retrieving SignatureDef functions is unimplemented currently"); diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h index fc8e738e86f..fd07c09474b 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/core/platform/status.h" @@ -55,7 +56,7 @@ class TFSavedModelAPI : public SavedModelAPI { ConcreteFunction** function) override; Status GetSignatureDefFunction(const std::string& signature_def_key, - ConcreteFunction** function) override; + SignatureDefFunction** function) override; static Status Load( const std::string& directory, diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 323298c5fc1..c0d121a4aee 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -142,6 +142,8 @@ cc_library( ":concrete_function_list_type", ":concrete_function_type", ":saved_model_api_type", + ":signature_def_function", + ":signature_def_function_type", "//tensorflow/c:c_api_macros", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_internal", @@ -165,6 +167,77 @@ cc_library( ], ) +cc_library( + name = "signature_def_function", + srcs = [ + "signature_def_function.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_function_metadata", + ":signature_def_function_metadata_type", + ":signature_def_function_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status_internal", + 
"//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:tfe_op_internal", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "signature_def_function_type", + hdrs = [ + "signature_def_function_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + ], +) + +cc_library( + name = "signature_def_function_metadata", + srcs = [ + "signature_def_function_metadata.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function_metadata.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_function_metadata_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + +cc_library( + name = "signature_def_function_metadata_type", + hdrs = [ + "signature_def_function_metadata_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 65c6eca5623..2beed8f4119 100644 --- a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -34,15 +34,15 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func, - TFE_TensorHandle** inputs, int num_inputs, - TF_Status* status) { +TFE_Op* TF_ConcreteFunctionMakeCallOp(TF_ConcreteFunction* func, + TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status) { tensorflow::ImmediateOpPtr call_op; absl::Span input_span( reinterpret_cast( tensorflow::unwrap(inputs)), static_cast(num_inputs)); - status->status = tensorflow::unwrap(func)->GetCallOp(input_span, &call_op); + status->status = tensorflow::unwrap(func)->MakeCallOp(input_span, &call_op); if (!status->status.ok()) { return nullptr; } diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc index 983c98affb2..b89fb9f6d64 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -106,9 +107,11 @@ TF_ConcreteFunction* TF_GetSavedModelConcreteFunction(TF_SavedModel* model, return tensorflow::wrap(result); } -TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( - TF_SavedModel* model, const char* signature_def_key, TF_Status* status) { - tensorflow::ConcreteFunction* result = nullptr; +TF_CAPI_EXPORT extern TF_SignatureDefFunction* +TF_GetSavedModelSignatureDefFunction(TF_SavedModel* model, + const char* signature_def_key, + TF_Status* status) { + tensorflow::SignatureDefFunction* result = nullptr; tensorflow::Status get_function_status = tensorflow::unwrap(model)->GetSignatureDefFunction(signature_def_key, &result); diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc index e58b232f9c9..df998fcf6cd 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc @@ -107,7 +107,7 @@ TEST_P(CSavedModelAPITest, LoadsSavedModel) { compute_fn_inputs.push_back(input_a); compute_fn_inputs.push_back(input_b); - TFE_Op* compute_fn_op = TF_ConcreteFunctionGetCallOp( + TFE_Op* compute_fn_op = TF_ConcreteFunctionMakeCallOp( compute_fn, compute_fn_inputs.data(), compute_fn_inputs.size(), status); EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc new file mode 100644 index 00000000000..64f7506f32e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/core/platform/status.h" + +extern "C" { + +TF_SignatureDefFunctionMetadata* TF_SignatureDefFunctionGetMetadata( + TF_SignatureDefFunction* func) { + return tensorflow::wrap(const_cast( + &tensorflow::unwrap(func)->GetFunctionMetadata())); +} + +TFE_Op* TF_SignatureDefFunctionMakeCallOp(TF_SignatureDefFunction* func, + TFE_TensorHandle** inputs, + int num_inputs, TF_Status* status) { + tensorflow::ImmediateOpPtr call_op; + absl::Span input_span( + reinterpret_cast( + tensorflow::unwrap(inputs)), + static_cast(num_inputs)); + status->status = tensorflow::unwrap(func)->MakeCallOp(input_span, &call_op); + if (!status->status.ok()) { + return nullptr; + } + return tensorflow::wrap(call_op.release()); +} + +} // end extern "C" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/register_all_passes.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc similarity index 73% rename from tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/register_all_passes.cc rename to tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc index 9349bee041e..c5c3616211c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/register_all_passes.cc +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc @@ -13,16 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir-hlo/Dialect/mhlo/transforms/register_passes.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" -namespace mlir { +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h" -namespace { - -bool register_all_passes = ([] { - mhlo::registerAllMhloPasses(); - lmhlo::registerAllLmhloPasses(); -}(), true); - -} // namespace -} // namespace mlir +// TODO(bmzhao): Add getter functions here as necessary. diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h new file mode 100644 index 00000000000..fa6d0f6541e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunctionMetadata, + TF_SignatureDefFunctionMetadata) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h new file mode 100644 index 00000000000..ca44dc43bd6 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" + +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunction, + TF_SignatureDefFunction) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index af65e05e7f6..d29585ae1ba 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,6 +24,8 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", + "signature_def_function.h", + "signature_def_function_metadata.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -39,6 +41,8 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", + ":signature_def_function", + ":signature_def_function_metadata", ], ) @@ -61,3 +65,13 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) + +alias( + name = "signature_def_function", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_function", +) + +alias( + name = "signature_def_function_metadata", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_function_metadata", +) diff --git a/tensorflow/c/experimental/saved_model/public/README.md b/tensorflow/c/experimental/saved_model/public/README.md new file mode 100644 index 00000000000..9b3f392d7a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/README.md @@ -0,0 +1,28 @@ +# TensorFlow Saved Model C API + +## Small ConcreteFunction Example + +The following example loads a saved model from `"/path/to/model"` and +executes a function `f` taking no arguments and returning one single +value (error checking is omitted for simplicity): + +```c +TF_Status* status = TF_NewStatus(); +TFE_ContextOptions* ctx_options = TFE_NewContextOptions(); +TFE_Context* ctx = TFE_NewContext(ctx_options, status); + +TF_SavedModel* saved_model = TF_LoadSavedModel("/path/to/model", ctx, status); +TF_ConcreteFunction* f = TF_GetSavedModelConcreteFunction(saved_model, "f", status); +TFE_Op* op = TF_ConcreteFunctionMakeCallOp(f, NULL, 0, status); + +TFE_TensorHandle* output; +int nouts = 1; +TFE_Execute(op, &output, &nouts, status); + +TFE_DeleteTensorHandle(output); +TFE_DeleteOp(op); +TFE_DeleteSavedModel(saved_model); +TFE_DeleteContext(ctx); +TFE_DeleteContextOptions(ctx_options); +TF_DeleteStatus(status); +``` diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index 30f533f140a..cedb9de66b8 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,6 +21,8 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index ee5292294d6..ff8a245961a 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -40,7 +40,14 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( // The caller is responsible for deleting the returned TFE_Op. If op // construction fails, `status` will be non-OK and the returned pointer will be // null. -TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionGetCallOp( +// TODO(bmzhao): Remove this function in a subsequent change; Design + implement +// a Function Execution interface for ConcreteFunction that accepts a tagged +// union of types (tensorflow::Value). This effectively requires moving much of +// the implementation of function.py/def_function.py to C++, and exposing a +// high-level API here. A strawman for what this interface could look like: +// TF_Value* TF_ExecuteFunction(TFE_Context*, TF_ConcreteFunction*, TF_Value* +// inputs, int num_inputs, TF_Status* status); +TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionMakeCallOp( TF_ConcreteFunction* func, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status); diff --git a/tensorflow/c/experimental/saved_model/public/saved_model_api.h b/tensorflow/c/experimental/saved_model/public/saved_model_api.h index 875167bec63..80ba37bab26 100644 --- a/tensorflow/c/experimental/saved_model/public/saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/saved_model_api.h @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" #include "tensorflow/c/tf_status.h" #ifdef __cplusplus @@ -91,10 +92,13 @@ TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelConcreteFunction( // status - Set to OK on success and an appropriate error on failure. // Returns: // If status is not OK, returns nullptr. Otherwise, returns a -// TF_ConcreteFunction instance. Once `model` is deleted, all -// `TF_ConcreteFunctions` retrieved from it are invalid, and have been deleted. -TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( - TF_SavedModel* model, const char* signature_def_key, TF_Status* status); +// TF_SignatureDefFunction instance. Once `model` is deleted, all +// `TF_SignatureDefFunctions` retrieved from it are invalid, and have been +// deleted. +TF_CAPI_EXPORT extern TF_SignatureDefFunction* +TF_GetSavedModelSignatureDefFunction(TF_SavedModel* model, + const char* signature_def_key, + TF_Status* status); // Returns a list of all ConcreteFunctions stored in this SavedModel. // The lifetime of the returned list is bound to `model`. 
diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_function.h b/tensorflow/c/experimental/saved_model/public/signature_def_function.h new file mode 100644 index 00000000000..16471fdc1fa --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_function.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +// Returns FunctionMetadata associated with `func`. Metadata's lifetime is +// bound to `func`, which is bound to the TF_SavedModel it was loaded from. +TF_CAPI_EXPORT extern TF_SignatureDefFunctionMetadata* +TF_SignatureDefFunctionGetMetadata(TF_SignatureDefFunction* func); + +// Returns a TFE_Op suitable for executing this function. Caller must provide +// all function inputs in `inputs`, and must not add any additional inputs on +// the returned op. (i.e. don't call TFE_OpAddInput or TFE_OpAddInputList). +// The caller is responsible for deleting the returned TFE_Op. If op +// construction fails, `status` will be non-OK and the returned pointer will be +// null. +TF_CAPI_EXPORT extern TFE_Op* TF_SignatureDefFunctionMakeCallOp( + TF_SignatureDefFunction* func, TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h new file mode 100644 index 00000000000..6f4459732c4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD new file mode 100644 index 00000000000..7daa311d461 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -0,0 +1,60 @@ +# Description: +# StreamExecutor C API. + +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "stream_executor", + srcs = ["stream_executor.cc"], + hdrs = ["stream_executor.h"], + visibility = ["//visibility:public"], + deps = [ + ":stream_executor_internal", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_status_helper", + "//tensorflow/core:lib", + "//tensorflow/stream_executor:executor_cache", + "//tensorflow/stream_executor:multi_platform_manager", + "//tensorflow/stream_executor:platform", + "//tensorflow/stream_executor:stream_executor_internal", + "//tensorflow/stream_executor:stream_executor_pimpl", + "//tensorflow/stream_executor:timer", + ], +) + +cc_library( + name = "stream_executor_internal", + hdrs = [ + "stream_executor.h", + "stream_executor_internal.h", + ], + deps = [ + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + "//tensorflow/stream_executor:executor_cache", + "//tensorflow/stream_executor/lib", + ], +) + +tf_cc_test( + name = "stream_executor_test", + srcs = ["stream_executor_test.cc"], + deps = [ + ":stream_executor", + ":stream_executor_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "//tensorflow/stream_executor:multi_platform_manager", + "//tensorflow/stream_executor:stream", + "//tensorflow/stream_executor:stream_executor_pimpl", + ], +) diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc new file mode 100644 index 00000000000..901ef942305 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -0,0 +1,853 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file extends/implements core stream executor base classes in terms of +// the C API defined in stream_executor.h. 
A class "CSomething" represents a +// "Something" that can be manipulated via calls in the C interface and a C +// struct called "SP_Something". +// +// This file also contains stream_executor::Platform registration for pluggable +// device. +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" + +#include + +#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" + +using tensorflow::StatusFromTF_Status; + +namespace stream_executor { +namespace { + +#define VALIDATE_STRUCT_SIZE(STRUCT_NAME, STRUCT_OBJ, SIZE_VALUE_NAME) \ + do { \ + if (STRUCT_OBJ.struct_size == 0) { \ + return port::FailedPreconditionError( \ + "struct_size field in " #STRUCT_NAME \ + " must be set to " #SIZE_VALUE_NAME "."); \ + } \ + } while (0) + +#define VALIDATE_MEMBER(STRUCT_NAME, STRUCT_OBJ, NAME) \ + do { \ + if (STRUCT_OBJ.NAME == 0) { \ + return port::FailedPreconditionError( \ + "'" #NAME "' field in " #STRUCT_NAME " must be set."); \ + } \ + } while (0) + +port::Status ValidateSPPlatform(const SP_Platform& platform) { + VALIDATE_STRUCT_SIZE(SP_Platform, platform, SP_PLATFORM_STRUCT_SIZE); + VALIDATE_MEMBER(SP_Platform, platform, name); + VALIDATE_MEMBER(SP_Platform, platform, type); + // `visible_device_count` could be 0 at initialization time. + return port::Status::OK(); +} + +port::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) { + VALIDATE_STRUCT_SIZE(SP_PlatformFns, platform_fns, + SP_PLATFORM_FNS_STRUCT_SIZE); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_device); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_device); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_stream_executor); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_stream_executor); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_timer_fns); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_timer_fns); + return port::Status::OK(); +} + +port::Status ValidateSPTimerFns(const SP_TimerFns& timer_fns) { + VALIDATE_STRUCT_SIZE(SP_TimerFns, timer_fns, SP_TIMER_FNS_STRUCT_SIZE); + VALIDATE_MEMBER(SP_TimerFns, timer_fns, nanoseconds); + return port::Status::OK(); +} + +port::Status ValidateSPAllocatorStats(const SP_AllocatorStats& stats) { + VALIDATE_STRUCT_SIZE(SP_AllocatorStats, stats, SP_ALLOCATORSTATS_STRUCT_SIZE); + // All other fields could theoretically be zero/null. + return port::Status::OK(); +} + +port::Status ValidateSPDeviceMemoryBase(const SP_DeviceMemoryBase& mem) { + VALIDATE_STRUCT_SIZE(SP_DeviceMemoryBase, mem, + SP_DEVICE_MEMORY_BASE_STRUCT_SIZE); + // All other fields could theoretically be zero/null. + return port::Status::OK(); +} + +port::Status ValidateSPDevice(const SP_Device& device) { + VALIDATE_STRUCT_SIZE(SP_Device, device, SP_DEVICE_STRUCT_SIZE); + // All other fields could theoretically be zero/null. 
+ return port::Status::OK(); +} + +port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se, + const SP_Platform& platform) { + VALIDATE_STRUCT_SIZE(SP_StreamExecutor, se, SP_STREAM_EXECUTOR_STRUCT_SIZE); + VALIDATE_MEMBER(SP_StreamExecutor, se, allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, deallocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_allocator_stats); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_memory_allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_memory_deallocate); + if (platform.supports_unified_memory) { + VALIDATE_MEMBER(SP_StreamExecutor, se, unified_memory_allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, unified_memory_deallocate); + } + VALIDATE_MEMBER(SP_StreamExecutor, se, device_memory_usage); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_stream); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_stream); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_stream_dependency); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_stream_status); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_event_status); + VALIDATE_MEMBER(SP_StreamExecutor, se, record_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, wait_for_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, start_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, stop_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, memcpy_dtoh); + VALIDATE_MEMBER(SP_StreamExecutor, se, memcpy_htod); + VALIDATE_MEMBER(SP_StreamExecutor, se, sync_memcpy_dtoh); + VALIDATE_MEMBER(SP_StreamExecutor, se, sync_memcpy_htod); + VALIDATE_MEMBER(SP_StreamExecutor, se, block_host_for_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, synchronize_all_activity); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_callback); + return port::Status::OK(); +} + +port::Status ValidateSEPlatformRegistrationParams( + const SE_PlatformRegistrationParams& params) { + VALIDATE_STRUCT_SIZE(SE_PlatformRegistrationParams, params, + SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE); + VALIDATE_MEMBER(SE_PlatformRegistrationParams, params, destroy_platform); + VALIDATE_MEMBER(SE_PlatformRegistrationParams, params, destroy_platform_fns); + return port::Status::OK(); +} +#undef VALIDATE_MEMBER + +struct TFStatusDeleter { + void operator()(TF_Status* s) const { TF_DeleteStatus(s); } +}; +using OwnedTFStatus = std::unique_ptr; + +class CStream : public internal::StreamInterface { + public: + CStream(SP_Device* device, SP_StreamExecutor* stream_executor) + : device_(device), + stream_executor_(stream_executor), + stream_handle_(nullptr) {} + ~CStream() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_stream(device_, &stream_handle_, c_status.get()); + port::Status s = StatusFromTF_Status(c_status.get()); + return s; + } + + void Destroy() { + if (stream_handle_ != nullptr) { + stream_executor_->destroy_stream(device_, stream_handle_); + stream_handle_ = nullptr; + } + } + + SP_Stream Handle() { return stream_handle_; } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Stream stream_handle_; +}; + +// Converts SE_EventStatus to Event::Status. 
+Event::Status SEEventStatusToEventStatus(SE_EventStatus s) { + switch (s) { + case SE_EVENT_ERROR: + return Event::Status::kError; + case SE_EVENT_PENDING: + return Event::Status::kPending; + case SE_EVENT_COMPLETE: + return Event::Status::kComplete; + default: + return Event::Status::kUnknown; + } +} + +class CEvent : public internal::EventInterface { + public: + CEvent(SP_Device* device, SP_StreamExecutor* stream_executor) + : device_(device), + stream_executor_(stream_executor), + event_handle_(nullptr) {} + ~CEvent() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_event(device_, &event_handle_, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + port::Status Record(SP_Stream stream_handle) { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->record_event(device_, stream_handle, event_handle_, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + void Destroy() { + if (event_handle_ != nullptr) { + stream_executor_->destroy_event(device_, event_handle_); + event_handle_ = nullptr; + } + } + + SP_Event Handle() { return event_handle_; } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Event event_handle_; +}; + +class CTimer : public internal::TimerInterface { + public: + CTimer(SP_Device* device, SP_StreamExecutor* stream_executor, + SP_TimerFns* timer_fns) + : device_(device), + stream_executor_(stream_executor), + timer_handle_(nullptr), + timer_fns_(timer_fns) {} + ~CTimer() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_timer(device_, &timer_handle_, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + void Destroy() { + if (timer_handle_ != nullptr) { + stream_executor_->destroy_timer(device_, timer_handle_); + timer_handle_ = nullptr; + } + } + + SP_Timer Handle() { return timer_handle_; } + + uint64 Microseconds() const override { + return timer_fns_->nanoseconds(timer_handle_) / 1000; + } + + uint64 Nanoseconds() const override { + return timer_fns_->nanoseconds(timer_handle_); + } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Timer timer_handle_; + SP_TimerFns* timer_fns_; +}; + +// Converts DeviceMemoryBase to a C struct. +SP_DeviceMemoryBase DeviceMemoryBaseToC(const DeviceMemoryBase* mem) { + SP_DeviceMemoryBase device_memory_base{SP_DEVICE_MEMORY_BASE_STRUCT_SIZE}; + // `opaque` field inside SP_DeviceMemoryBase is not const. + // Therefore, we need to cast away the constness before setting it. + device_memory_base.opaque = const_cast(mem->opaque()); + device_memory_base.size = mem->size(); + device_memory_base.payload = mem->payload(); + // TODO(annarev): Add `ext` field to DeviceMemoryBase and set it here. + return device_memory_base; +} + +DeviceMemoryBase DeviceMemoryBaseFromC(const SP_DeviceMemoryBase& mem) { + DeviceMemoryBase base(mem.opaque, mem.size); + base.SetPayload(mem.payload); + // TODO(annarev): Add `ext` field to DeviceMemoryBase and set it here. + return base; +} + +// Wrapper that allows passing std::function across C API. +struct HostCallbackContext { + std::function callback; +}; + +// This wrapper allows calling `HostCallbackContext::callback` across C API. +// This function matches `SE_StatusCallbackFn` signature and will be passed as +// `callback_fn` to `host_callback` in `SP_StreamExecutor`. 
+void HostCallbackTrampoline(void* ctx, TF_Status* status) { + HostCallbackContext* host_ctx = static_cast(ctx); + port::Status s = host_ctx->callback(); + Set_TF_Status_from_Status(status, s); + delete host_ctx; +} + +class CStreamExecutor : public internal::StreamExecutorInterface { + public: + explicit CStreamExecutor(SP_Device device, SP_StreamExecutor* stream_executor, + SP_Platform* platform, SP_PlatformFns* platform_fns, + SP_TimerFns* timer_fns, const std::string& name, + int visible_device_count) + : device_(std::move(device)), + stream_executor_(stream_executor), + platform_(platform), + platform_fns_(platform_fns), + timer_fns_(timer_fns), + platform_name_(name), + visible_device_count_(visible_device_count) {} + + ~CStreamExecutor() override { + platform_fns_->destroy_device(platform_, &device_); + } + + port::Status Init(int device_ordinal, DeviceOptions device_options) override { + return port::Status::OK(); + } + + DeviceMemoryBase Allocate(uint64 size, int64 memory_space) override { + SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE}; + stream_executor_->allocate(&device_, size, memory_space, &mem); + port::Status status = ValidateSPDeviceMemoryBase(mem); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } + return DeviceMemoryBaseFromC(mem); + } + DeviceMemoryBase Allocate(uint64 size) { + return Allocate(size, /*memory_space=*/0); + } + void* GetSubBuffer(DeviceMemoryBase* parent, uint64 offset, + uint64 size) override { + LOG(FATAL) << "GetSubBuffer is not supported by pluggable device."; + } + + void Deallocate(DeviceMemoryBase* mem) override { + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(mem); + stream_executor_->deallocate(&device_, &device_memory_base); + } + + void* HostMemoryAllocate(uint64 size) override { + return stream_executor_->host_memory_allocate(&device_, size); + } + + void HostMemoryDeallocate(void* mem) override { + stream_executor_->host_memory_deallocate(&device_, mem); + } + + bool HostMemoryRegister(void* mem, uint64 size) override { return false; } + bool HostMemoryUnregister(void* mem) override { return false; } + + void* UnifiedMemoryAllocate(uint64 size) override { + CHECK(stream_executor_->unified_memory_allocate); + return stream_executor_->unified_memory_allocate(&device_, size); + } + + void UnifiedMemoryDeallocate(void* mem) override { + CHECK(stream_executor_->unified_memory_deallocate); + stream_executor_->unified_memory_deallocate(&device_, mem); + } + + absl::optional GetAllocatorStats() override { + SP_AllocatorStats c_stats{SP_ALLOCATORSTATS_STRUCT_SIZE}; + TF_Bool has_stats = + stream_executor_->get_allocator_stats(&device_, &c_stats); + if (!has_stats) { + return absl::nullopt; + } + port::Status status = ValidateSPAllocatorStats(c_stats); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + return absl::nullopt; + } + // TODO(annarev): validate SP_AllocatorStats. 
+ ::stream_executor::AllocatorStats stats; + stats.num_allocs = c_stats.num_allocs; + stats.bytes_in_use = c_stats.bytes_in_use; + stats.peak_bytes_in_use = c_stats.peak_bytes_in_use; + stats.largest_alloc_size = c_stats.largest_alloc_size; + if (c_stats.has_bytes_limit) { + stats.bytes_limit = c_stats.bytes_limit; + } + stats.bytes_reserved = c_stats.bytes_reserved; + stats.peak_bytes_reserved = c_stats.peak_bytes_reserved; + if (c_stats.has_bytes_reservable_limit) { + stats.bytes_reservable_limit = c_stats.bytes_reservable_limit; + } + stats.largest_free_block_bytes = c_stats.largest_free_block_bytes; + return stats; + } + bool SynchronizeAllActivity() override { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->synchronize_all_activity(&device_, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + port::Status SynchronousMemZero(DeviceMemoryBase* location, + uint64 size) override { + // TODO(annarev): figure out if we should support memzero/memset + // functionality by allocating on host and then copying to device. + return port::UnimplementedError( + "SynchronousMemZero is not supported by pluggable device."); + } + port::Status SynchronousMemSet(DeviceMemoryBase* location, int value, + uint64 size) override { + return port::UnimplementedError( + "SynchronousMemSet is not supported by pluggable device."); + } + port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst, + const void* host_src, uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(gpu_dst); + stream_executor_->sync_memcpy_htod(&device_, &device_memory_base, host_src, + size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status SynchronousMemcpy(void* host_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->sync_memcpy_dtoh(&device_, host_dst, &device_memory_base, + size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->sync_memcpy_dtod(&device_, &device_mem_dst, + &device_mem_src, size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status MemZero(Stream* stream, DeviceMemoryBase* location, + uint64 size) override { + return port::UnimplementedError( + "MemZero is not supported by pluggable device."); + } + port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern, + uint64 size) override { + return port::UnimplementedError( + "Memset is not supported by pluggable device."); + } + port::Status Memset32(Stream* stream, DeviceMemoryBase* location, + uint32 pattern, uint64 size) override { + return port::UnimplementedError( + "Memset32 is not supported by pluggable device."); + } + bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + 
stream_executor_->memcpy_dtoh(&device_, stream_handle, host_dst, + &device_mem_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + stream_executor_->memcpy_htod(&device_, stream_handle, &device_mem_dst, + host_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->memcpy_dtod(&device_, stream_handle, &device_mem_dst, + &device_mem_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool HostCallback(Stream* stream, + std::function callback) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + HostCallbackContext* ctx = new HostCallbackContext{callback}; + return stream_executor_->host_callback(&device_, stream_handle, + &HostCallbackTrampoline, ctx); + } + port::Status AllocateEvent(Event* event) override { + DCHECK(event != nullptr); + return static_cast(event->implementation())->Create(); + } + port::Status DeallocateEvent(Event* event) override { + static_cast(event->implementation())->Destroy(); + return port::Status::OK(); + } + port::Status RecordEvent(Stream* stream, Event* event) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + return static_cast(event->implementation())->Record(stream_handle); + } + port::Status WaitForEvent(Stream* stream, Event* event) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->wait_for_event(&device_, stream_handle, event_handle, + c_status.get()); + port::Status s = StatusFromTF_Status(c_status.get()); + return s; + } + Event::Status PollForEventStatus(Event* event) override { + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + SE_EventStatus event_status = + stream_executor_->get_event_status(&device_, event_handle); + return SEEventStatusToEventStatus(event_status); + } + bool AllocateStream(Stream* stream) override { + DCHECK(stream != nullptr); + port::Status status = + static_cast(stream->implementation())->Create(); + // TODO(annarev): update AllocateStream to return status instead + // (similar to AllocateEvent). 
+ return status.ok(); + } + void DeallocateStream(Stream* stream) override { + static_cast(stream->implementation())->Destroy(); + } + bool CreateStreamDependency(Stream* dependent, Stream* other) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream dependent_handle = + static_cast(dependent->implementation())->Handle(); + SP_Stream other_handle = + static_cast(other->implementation())->Handle(); + stream_executor_->create_stream_dependency(&device_, dependent_handle, + other_handle, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool AllocateTimer(Timer* timer) override { + port::Status status = + static_cast(timer->implementation())->Create(); + // TODO(annarev): change return value of AllocateTimer + // to status (similar to AllocateEvent). + return status.ok(); + } + void DeallocateTimer(Timer* timer) override { + static_cast(timer->implementation())->Destroy(); + } + bool StartTimer(Stream* stream, Timer* timer) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Timer timer_handle = + static_cast(timer->implementation())->Handle(); + stream_executor_->start_timer(&device_, stream_handle, timer_handle, + c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool StopTimer(Stream* stream, Timer* timer) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Timer timer_handle = + static_cast(timer->implementation())->Handle(); + stream_executor_->stop_timer(&device_, stream_handle, timer_handle, + c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + port::Status BlockHostForEvent(Stream* stream, Event* event) { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + stream_executor_->block_host_for_event(&device_, event_handle, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + port::Status BlockHostUntilDone(Stream* stream) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + + // If `block_host_until_done` is set, use it. + if (stream_executor_->block_host_until_done != nullptr) { + stream_executor_->block_host_until_done(&device_, stream_handle, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + // Create and record an event and then wait for it. 
+    SP_Event event_handle;
+    stream_executor_->create_event(&device_, &event_handle, c_status.get());
+    TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));
+    stream_executor_->record_event(&device_, stream_handle, event_handle,
+                                   c_status.get());
+    port::Status s = StatusFromTF_Status(c_status.get());
+    if (!s.ok()) {
+      stream_executor_->destroy_event(&device_, event_handle);
+      return s;
+    }
+    stream_executor_->block_host_for_event(&device_, event_handle,
+                                           c_status.get());
+    stream_executor_->destroy_event(&device_, event_handle);
+    return StatusFromTF_Status(c_status.get());
+  }
+
+  port::Status GetStatus(Stream* stream) override {
+    OwnedTFStatus c_status(TF_NewStatus());
+    SP_Stream stream_handle =
+        static_cast<CStream*>(stream->implementation())->Handle();
+    stream_executor_->get_stream_status(&device_, stream_handle,
+                                        c_status.get());
+    return StatusFromTF_Status(c_status.get());
+  }
+  int PlatformDeviceCount() override { return visible_device_count_; }
+  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override {
+    return port::UnimplementedError(
+        "EnablePeerAccessTo is not supported by pluggable device.");
+  }
+  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override {
+    return false;
+  }
+
+  bool DeviceMemoryUsage(int64* free, int64* total) const override {
+    static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
+                  "64-bit int types should match in size");
+    return stream_executor_->device_memory_usage(
+        &device_, reinterpret_cast<int64_t*>(free),
+        reinterpret_cast<int64_t*>(total));
+  }
+
+  // Creates a new DeviceDescription object.
+  // Ownership is transferred to the caller.
+  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+      const override {
+    // TODO(annarev): Figure out if we need to support more description fields.
+    internal::DeviceDescriptionBuilder builder;
+    builder.set_name(platform_name_);
+    // TODO(annarev): Also set `supports_unified_memory` in DeviceDescription.
+    return builder.Build();
+  }
+
+  // Each call creates a new instance of the platform-specific implementation of
+  // the corresponding interface type.
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override {
+    return std::unique_ptr<internal::EventInterface>(
+        new CEvent(&device_, stream_executor_));
+  }
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override {
+    LOG(FATAL)
+        << "CreateKernelImplementation is not supported by pluggable device.";
+  }
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
+      override {
+    return std::unique_ptr<internal::StreamInterface>(
+        new CStream(&device_, stream_executor_));
+  }
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
+    return std::unique_ptr<internal::TimerInterface>(
+        new CTimer(&device_, stream_executor_, timer_fns_));
+  }
+
+ private:
+  SP_Device device_;
+  SP_StreamExecutor* stream_executor_;
+  SP_Platform* platform_;
+  SP_PlatformFns* platform_fns_;
+  SP_TimerFns* timer_fns_;
+  std::string platform_name_;
+  int visible_device_count_;
+};
+}  // namespace
+
+CPlatform::CPlatform(SP_Platform platform,
+                     void (*destroy_platform)(SP_Platform*),
+                     SP_PlatformFns platform_fns,
+                     void (*destroy_platform_fns)(SP_PlatformFns*),
+                     SP_StreamExecutor stream_executor, SP_TimerFns timer_fns)
+    : platform_(std::move(platform)),
+      destroy_platform_(destroy_platform),
+      platform_fns_(std::move(platform_fns)),
+      destroy_platform_fns_(destroy_platform_fns),
+      stream_executor_(std::move(stream_executor)),
+      timer_fns_(std::move(timer_fns)),
+      name_(platform.name) {}
+
+CPlatform::~CPlatform() {
+  executor_cache_.DestroyAllExecutors();
+  platform_fns_.destroy_stream_executor(&platform_, &stream_executor_);
+  platform_fns_.destroy_timer_fns(&platform_, &timer_fns_);
+  destroy_platform_(&platform_);
+  destroy_platform_fns_(&platform_fns_);
+}
+
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+CPlatform::DescriptionForDevice(int ordinal) const {
+  // TODO(annarev): see if we can get StreamExecutor instance
+  // and call GetDeviceDescription. executor_cache_.Get would need
+  // to be made const for it to work.
+  internal::DeviceDescriptionBuilder builder;
+  builder.set_name(name_);
+  return builder.Build();
+}
+port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDevice(int ordinal) {
+  stream_executor::StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  return GetExecutor(config);
+}
+port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDeviceWithPluginConfig(
+    int ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = plugin_config;
+  return GetExecutor(config);
+}
+port::StatusOr<StreamExecutor*> CPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  return executor_cache_.GetOrCreate(
+      config, [&]() { return GetUncachedExecutor(config); });
+}
+port::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
+    const StreamExecutorConfig& config) {
+  // Fill device creation params
+  SE_CreateDeviceParams device_params{SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE};
+  SP_Device device{SP_DEVICE_STRUCT_SIZE};
+  device_params.device = &device;
+  device_params.ext = nullptr;
+  device_params.ordinal = config.ordinal;
+  OwnedTFStatus c_status(TF_NewStatus());
+
+  // Create Device
+  platform_fns_.create_device(&platform_, &device_params, c_status.get());
+  TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPDevice(device));
+
+  auto executor = absl::make_unique<CStreamExecutor>(
+      std::move(device), &stream_executor_, &platform_, &platform_fns_,
+      &timer_fns_, name_, platform_.visible_device_count);
+  auto result = absl::make_unique<StreamExecutor>(this, std::move(executor),
+                                                  config.ordinal);
+  return result;
+}
+
+port::Status InitStreamExecutorPlugin(void* dso_handle) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+
+  // Step 1: Load symbol for `SE_InitPlugin`
+  void* dso_symbol;
+  TF_RETURN_IF_ERROR(
+      env->GetSymbolFromLibrary(dso_handle, "SE_InitPlugin", &dso_symbol));
+
+  // Step 2: Call `SE_InitPlugin`
+  auto init_fn = reinterpret_cast<SEInitPluginFn>(dso_symbol);
+  return InitStreamExecutorPlugin(init_fn);
+}
+
+port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn) {
+  SE_PlatformRegistrationParams params{
+      SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE};
+  SP_Platform platform{SP_PLATFORM_STRUCT_SIZE};
+  SP_PlatformFns platform_fns{SP_PLATFORM_FNS_STRUCT_SIZE};
+  params.major_version = SE_MAJOR;
+  params.minor_version = SE_MINOR;
+  params.patch_version = SE_PATCH;
+  params.platform = &platform;
+  params.platform_fns = &platform_fns;
+
+  OwnedTFStatus c_status(TF_NewStatus());
+  init_fn(&params, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSEPlatformRegistrationParams(params));
+  TF_RETURN_IF_ERROR(ValidateSPPlatform(platform));
+  TF_RETURN_IF_ERROR(ValidateSPPlatformFns(platform_fns));
+
+  // Fill stream executor creation params
+  SE_CreateStreamExecutorParams se_params{
+      SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE};
+  SP_StreamExecutor se{SP_STREAMEXECUTOR_STRUCT_SIZE};
+  se_params.stream_executor = &se;
+
+  // Create StreamExecutor
+  platform_fns.create_stream_executor(&platform, &se_params, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPStreamExecutor(se, platform));
+
+  SP_TimerFns timer_fns{SP_TIMER_FNS_STRUCT_SIZE};
+  platform_fns.create_timer_fns(&platform, &timer_fns, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPTimerFns(timer_fns));
+
+  platform_fns.create_timer_fns(&platform, &timer_fns, c_status.get());
+
TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); + TF_RETURN_IF_ERROR(ValidateSPTimerFns(timer_fns)); + + // Register new platform + std::string platform_name = std::string(platform.name); + std::unique_ptr cplatform( + new stream_executor::CPlatform( + std::move(platform), params.destroy_platform, std::move(platform_fns), + params.destroy_platform_fns, std::move(se), std::move(timer_fns))); + SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform( + std::move(cplatform))); + + // TODO(annarev): Add pluggable device registration here. + return port::Status::OK(); +} +} // namespace stream_executor diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.h b/tensorflow/c/experimental/stream_executor/stream_executor.h new file mode 100644 index 00000000000..796b4e95121 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor.h @@ -0,0 +1,439 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_status.h" + +// -------------------------------------------------------------------------- +// C API for StreamExecutor. The API is under active development and eventually +// should allow registering a pluggable device with TensorFlow. +// +// Conventions: +// * Struct prefix indicates whether struct fields should be filled by the +// plugin or core implementation: +// * SE_ : set/filled by core unless explicitly marked otherwise. +// * SP_ : set/filled by plugin unless explicitly marked otherwise. +// * We use `struct_size` for version checking. It is exempt from the `SE/SP` +// rule above and should be set both by core and the plugin. +// * For example, `create_device` function receives `SP_Device*` as input +// with `struct_size` populated by core. The plugin is responsible for +// setting `struct_size` as well, along with all other fields. +// * Refer to "TensorFlow Versioning Strategy" section at +// https://github.com/tensorflow/community/pull/257/files. +// * Note that the API is still under active development and doesn't have +// versioning guarantees yet. +// * `void* ext` is a free-form field that can be populated by +// a plugin in `SP_*` structs or potential future extension points in `SE_` +// structs. +// +// Example usage: +// +// /* Sample TensorFlow code below, exact implementation might differ. */ +// // Version checking uses `struct_size`. It is exempt from the `SE/SP` rule +// // above and should be set both by core and the plugin." 
+// SP_Device device { SP_DEVICE_STRUCT_SIZE }; +// SE_CreateDeviceParams params { SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE } ; +// params.device = &device; +// +// /* Plugin code below */ +// constexpr char DEVICE_NAME[] = "MyDevice"; +// constexpr char DEVICE_TYPE[] = "GPU"; +// +// void create_device(const SP_Platform* platform, +// SE_CreateDeviceParams* params, TF_Status* status) { +// // Custom actions based on TensorFlow's view of SP_Device. +// OnTFDeviceView(params->device->struct_size); +// params->device = { SP_DEVICE_STRUCT_SIZE }; +// params->device->device_handle = get_my_device_handle(device->ordinal); +// params->device->ordinal = params->ordinal; +// ... +// } +// +// void destroy_device(const SP_Platform* platform, SP_Device* device) { +// delete_my_device_handle(device->device_handle); +// } +// +// void SE_InitPlugin( +// SE_PlatformRegistrationParams* params, +// TF_Status* status) { +// params->platform = { SP_PLATFORM_STRUCT_SIZE }; +// // Values such as `name` and `type` must outlive SE_InitPlugin call. +// params->platform->name = DEVICE_NAME; +// params->platform->type = DEVICE_TYPE; +// params->platform->visible_device_count = 2; +// params->platform_fns->create_device = create_device; +// params->platform_fns->destroy_device = destroy_device; +// ... +// } + +#define SE_MAJOR 0 +#define SE_MINOR 0 +#define SE_PATCH 1 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct SP_Stream_st* SP_Stream; +typedef struct SP_Event_st* SP_Event; +typedef struct SP_Timer_st* SP_Timer; +// Takes `callback_arg` passed to `host_callback` as the first argument. +typedef void (*SE_StatusCallbackFn)(void* const, TF_Status* const); + +typedef struct SP_TimerFns { + size_t struct_size; + void* ext; // reserved for future use + uint64_t (*nanoseconds)(SP_Timer timer); +} SP_TimerFns; + +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, nanoseconds) + +typedef struct SP_AllocatorStats { + size_t struct_size; + int64_t num_allocs; + int64_t bytes_in_use; + int64_t peak_bytes_in_use; + int64_t largest_alloc_size; + + int8_t has_bytes_limit; + int64_t bytes_limit; + + int64_t bytes_reserved; + int64_t peak_bytes_reserved; + + int8_t has_bytes_reservable_limit; + int64_t bytes_reservable_limit; + + int64_t largest_free_block_bytes; +} SP_AllocatorStats; + +#define SP_ALLOCATORSTATS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) + +// Potential states for an SP_Event. If `poll_for_status` returns anything aside +// from kPending or kComplete, an error has occurred; kUnknown is a bad state. +typedef enum SE_EventStatus { + SE_EVENT_UNKNOWN, + SE_EVENT_ERROR, + SE_EVENT_PENDING, + SE_EVENT_COMPLETE, +} SE_EventStatus; + +// Memory allocation information. +// This matches DeviceMemoryBase defined here: +// https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/device_memory.h;l=57 +typedef struct SP_DeviceMemoryBase { + size_t struct_size; + void* ext; // free-form data set by plugin + // Platform-dependent value representing allocated memory. + void* opaque; + uint64_t size; // Size in bytes of this allocation. + uint64_t payload; // Value for plugin's use +} SP_DeviceMemoryBase; + +#define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_DeviceMemoryBase, payload) + +typedef struct SP_Device { + size_t struct_size; + void* ext; // free-form data set by plugin + int32_t ordinal; // device index + + // Device vendor can store handle to their device representation + // here. 
+ void* device_handle; +} SP_Device; + +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, device_handle) + +typedef struct SE_CreateDeviceParams { + size_t struct_size; + void* ext; // reserved for future use + int32_t ordinal; // device index + + SP_Device* device; // Input/output, struct_size set by TF for plugin to read. + // Subsequently plugin fills the entire struct. +} SE_CreateDeviceParams; + +#define SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceParams, device) + +typedef struct SP_StreamExecutor { + size_t struct_size; + void* ext; // reserved for future use + + /*** ALLOCATION CALLBACKS ***/ + // Synchronously allocates `size` bytes on the underlying platform and returns + // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, + // nullptr is returned. + // `memory_space` is reserved for a potential future usage and should be set + // to 0. + void (*allocate)(const SP_Device* device, uint64_t size, int64_t memory_space, + SP_DeviceMemoryBase* mem); + + // Deallocate the device memory previously allocated via this interface. + // Deallocation of a nullptr-representative value is permitted. + void (*deallocate)(const SP_Device* device, SP_DeviceMemoryBase* memory); + + // Allocates a region of host memory and registers it with the platform API. + // Memory allocated in this manner is required for use in asynchronous memcpy + // operations, such as `memcpy_dtoh`. + void* (*host_memory_allocate)(const SP_Device* device, uint64_t size); + + // Deallocates a region of host memory allocated by `host_memory_allocate`. + void (*host_memory_deallocate)(const SP_Device* device, void* mem); + + // Allocates unified memory space of the given size, if supported. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void* (*unified_memory_allocate)(const SP_Device* device, uint64_t bytes); + + // Deallocates unified memory space previously allocated with + // `unified_memory_allocate`. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void (*unified_memory_deallocate)(const SP_Device* device, void* location); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + SP_AllocatorStats* stats); + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, int64_t* free, + int64_t* total); + + /*** STREAM CALLBACKS ***/ + // Creates SP_Stream. This call should also allocate stream + // resources on the underlying platform and initializes its + // internals. + void (*create_stream)(const SP_Device* device, SP_Stream* stream, + TF_Status* status); + + // Destroys SP_Stream and deallocates any underlying resources. + void (*destroy_stream)(const SP_Device* device, SP_Stream stream); + + // Causes `dependent` to not begin execution until `other` has finished its + // last-enqueued work. + void (*create_stream_dependency)(const SP_Device* device, SP_Stream dependent, + SP_Stream other, TF_Status* status); + + // Without blocking the device, retrieve the current stream status. + void (*get_stream_status)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + + /*** EVENT CALLBACKS ***/ + // Create SP_Event. 
Performs platform-specific allocation and initialization + // of an event. + void (*create_event)(const SP_Device* device, SP_Event* event, + TF_Status* status); + + // Destroy SE_Event and perform any platform-specific deallocation and + // cleanup of an event. + void (*destroy_event)(const SP_Device* device, SP_Event event); + + // Requests the current status of the event from the underlying platform. + SE_EventStatus (*get_event_status)(const SP_Device* device, SP_Event event); + // Inserts the specified event at the end of the specified stream. + void (*record_event)(const SP_Device* device, SP_Stream stream, + SP_Event event, TF_Status* status); + + // Wait for the specified event at the end of the specified stream. + void (*wait_for_event)(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status); + + /*** TIMER CALLBACKS ***/ + // Creates SP_Timer. Allocates timer resources on the underlying platform + // and initializes its internals, setting `timer` output variable. Sets + // values in `timer_fns` struct. + void (*create_timer)(const SP_Device* device, SP_Timer* timer, + TF_Status* status); + + // Destroy timer and deallocates timer resources on the underlying platform. + void (*destroy_timer)(const SP_Device* device, SP_Timer timer); + + // Records a start event for an interval timer. + void (*start_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + // Records a stop event for an interval timer. + void (*stop_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + /*** MEMCPY CALLBACKS ***/ + // Enqueues a memcpy operation onto stream, with a host destination location + // `host_dst` and a device memory source, with target size `size`. + void (*memcpy_dtoh)(const SP_Device* device, SP_Stream stream, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a host memory source, with target size `size`. + void (*memcpy_htod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, const void* host_src, + uint64_t size, TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a device memory source, with target size `size`. + void (*memcpy_dtod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the device source to the host destination. + void (*sync_memcpy_dtoh)(const SP_Device* device, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the host source to the device destination. + void (*sync_memcpy_htod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is copied from the + // device source to the device destination. + void (*sync_memcpy_dtod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Causes the host code to synchronously wait for the event to complete. 
+ void (*block_host_for_event)(const SP_Device* device, SP_Event event, + TF_Status* status); + + // [Optional] + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous device + // operations enqueued on the stream before this program point. + // If not set, then corresponding functionality will be implemented + // by registering an event on the `stream` and waiting for it using + // `block_host_for_event`. + void (*block_host_until_done)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + + // Synchronizes all activity occurring in the StreamExecutor's context (most + // likely a whole device). + void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); + + // Enqueues on a stream a user-specified function to be run on the host. + // `callback_arg` should be passed as the first argument to `callback_fn`. + TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, + SE_StatusCallbackFn callback_fn, void* callback_arg); +} SP_StreamExecutor; + +#define SP_STREAMEXECUTOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) + +typedef struct SE_CreateStreamExecutorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_StreamExecutor* stream_executor; // output, to be filled by plugin +} SE_CreateStreamExecutorParams; + +#define SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateStreamExecutorParams, stream_executor) + +typedef struct SP_Platform { + size_t struct_size; + + void* ext; // free-form data set by plugin + + // Platform name. Must be null-terminated. + const char* name; + + // Device type name, for example GPU. Must be null-terminated. + const char* type; + + // Number of visible devices + size_t visible_device_count; + + // Whether this platform supports unified memory. + // Unified memory is a single memory address space accessible from any device. + TF_Bool supports_unified_memory; +} SP_Platform; + +#define SP_PLATFORM_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_Platform, supports_unified_memory) + +typedef struct SP_PlatformFns { + size_t struct_size; + + void* ext; // reserved for future use + + // Callbacks for creating/destroying SP_Device. + void (*create_device)(const SP_Platform* platform, + SE_CreateDeviceParams* params, TF_Status* status); + + // Clean up fields inside SP_Device that were allocated + // by the plugin. `device` itself should not be deleted here. + void (*destroy_device)(const SP_Platform* platform, SP_Device* device); + + // Callbacks for creating/destroying SP_StreamExecutor. + void (*create_stream_executor)(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, + TF_Status* status); + // Clean up fields inside SP_StreamExecutor that were allocated + // by the plugin. `stream_executor` itself should not be deleted here. + void (*destroy_stream_executor)(const SP_Platform* platform, + SP_StreamExecutor* stream_executor); + + // Callbacks for creating/destroying SP_TimerFns. + void (*create_timer_fns)(const SP_Platform* platform, SP_TimerFns* timer, + TF_Status* status); + + void (*destroy_timer_fns)(const SP_Platform* platform, + SP_TimerFns* timer_fns); +} SP_PlatformFns; + +#define SP_PLATFORM_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_PlatformFns, destroy_timer_fns) + +typedef struct SE_PlatformRegistrationParams { + size_t struct_size; + void* ext; // reserved for future use + + // StreamExecutor C API version. 
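+  // (Set by TensorFlow before SE_InitPlugin is called, so a plugin can verify +  // that it was built against a compatible StreamExecutor C API.)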
+ int32_t major_version; + int32_t minor_version; + int32_t patch_version; + + SP_Platform* platform; // output, set by plugin + SP_PlatformFns* platform_fns; // output, set by plugin + // Clean up fields inside SP_Platform that were allocated + // by the plugin. `platform` itself should not be deleted here. + void (*destroy_platform)(SP_Platform* platform); // out, set by plugin + void (*destroy_platform_fns)( + SP_PlatformFns* platform_fns); // out, set by plugin +} SE_PlatformRegistrationParams; + +#define SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform_fns) + +void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h new file mode 100644 index 00000000000..079c3661453 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Classes and utilities that work with StreamExecutor C API for internal use. +// This includes functions used for device registration and interfaces needed +// for testing. +#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_ +#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_ + +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform.h" + +namespace stream_executor { + +// Plugin initialization function that a device plugin +// must define. +typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams* const, + TF_Status* const); + +// Registers StreamExecutor platform. +port::Status InitStreamExecutorPlugin(void* dso_handle); + +// Allow registering a StreamExecutor plugin using a function (used for +// testing). 
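+// For tests, a capture-less lambda can serve as the SEInitPluginFn; a minimal +// sketch (mirroring stream_executor_test.cc below): +//   auto plugin_init = [](SE_PlatformRegistrationParams* const params, +//                         TF_Status* const status) -> void { +//     TF_SetStatus(status, TF_OK, ""); +//     // fill params->platform, params->platform_fns and the destroy callbacks +//   }; +//   port::Status s = InitStreamExecutorPlugin(plugin_init);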
+port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn); + +class CPlatform : public Platform { + public: + explicit CPlatform(SP_Platform platform, + void (*destroy_platform)(SP_Platform*), + SP_PlatformFns platform_fns, + void (*destroy_platform_fns)(SP_PlatformFns*), + SP_StreamExecutor stream_executor, SP_TimerFns timer_fns); + ~CPlatform() override; + + Id id() const override { return const_cast(&plugin_id_value_); } + const std::string& Name() const override { return name_; } + int VisibleDeviceCount() const override { + return platform_.visible_device_count; + } + port::StatusOr> DescriptionForDevice( + int ordinal) const override; + port::StatusOr ExecutorForDevice(int ordinal) override; + port::StatusOr ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& plugin_config) override; + port::StatusOr GetExecutor( + const StreamExecutorConfig& config) override; + port::StatusOr> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + // Trace listener is not supported + void RegisterTraceListener(std::unique_ptr listener) override { + LOG(FATAL) << "RegisterTraceListener is not supported by pluggable device"; + } + void UnregisterTraceListener(TraceListener* listener) override {} + + void DestroyAllExecutors() { executor_cache_.DestroyAllExecutors(); } + + private: + SP_Platform platform_; + void (*destroy_platform_)(SP_Platform*); + SP_PlatformFns platform_fns_; + void (*destroy_platform_fns_)(SP_PlatformFns*); + SP_StreamExecutor stream_executor_; + SP_TimerFns timer_fns_; + const std::string name_; + int plugin_id_value_; + stream_executor::ExecutorCache executor_cache_; +}; + +} // namespace stream_executor +#endif // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_ diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc new file mode 100644 index 00000000000..c280a3975b7 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -0,0 +1,883 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0(the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" + +#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" + +struct SP_Stream_st { + explicit SP_Stream_st(int id) : stream_id(id) {} + int stream_id; +}; + +struct SP_Event_st { + explicit SP_Event_st(int id) : event_id(id) {} + int event_id; +}; + +struct SP_Timer_st { + explicit SP_Timer_st(int id) : timer_id(id) {} + int timer_id; +}; + +namespace stream_executor { +namespace { +constexpr int DEVICE_COUNT = 2; +constexpr char DEVICE_NAME[] = "MyDevice"; +constexpr char DEVICE_TYPE[] = "GPU"; + +/*** Create SP_StreamExecutor (with empty functions) ***/ +void allocate(const SP_Device* const device, uint64_t size, + int64_t memory_space, SP_DeviceMemoryBase* const mem) {} +void deallocate(const SP_Device* const device, SP_DeviceMemoryBase* const mem) { +} +void* host_memory_allocate(const SP_Device* const device, uint64_t size) { + return nullptr; +} +void host_memory_deallocate(const SP_Device* const device, void* mem) {} +TF_Bool get_allocator_stats(const SP_Device* const device, + SP_AllocatorStats* const stats) { + return true; +} +TF_Bool device_memory_usage(const SP_Device* const device, int64_t* const free, + int64_t* const total) { + return true; +} +void create_stream(const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) { + stream = nullptr; +} +void destroy_stream(const SP_Device* const device, SP_Stream stream) {} +void create_stream_dependency(const SP_Device* const device, + SP_Stream dependent, SP_Stream other, + TF_Status* const status) {} +void get_stream_status(const SP_Device* const device, SP_Stream stream, + TF_Status* const status) {} +void create_event(const SP_Device* const device, SP_Event* event, + TF_Status* const status) { + event = nullptr; +} +void destroy_event(const SP_Device* const device, SP_Event event) {} +SE_EventStatus get_event_status(const SP_Device* const device, SP_Event event) { + return SE_EVENT_UNKNOWN; +} +void record_event(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) {} +void wait_for_event(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) {} +void create_timer(const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) {} +void destroy_timer(const SP_Device* const device, SP_Timer timer) {} +void start_timer(const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) {} +void stop_timer(const SP_Device* const device, SP_Stream stream, SP_Timer timer, + TF_Status* const status) {} +void memcpy_dtoh(const SP_Device* const device, SP_Stream stream, + void* host_dst, const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) {} +void memcpy_htod(const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, const void* host_src, + uint64_t size, TF_Status* const status) {} +void sync_memcpy_dtoh(const SP_Device* const device, void* host_dst, + 
const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) {} +void sync_memcpy_htod(const SP_Device* const device, + SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, + TF_Status* const status) {} +void block_host_for_event(const SP_Device* const device, SP_Event event, + TF_Status* const status) {} +void synchronize_all_activity(const SP_Device* const device, + TF_Status* const status) {} +TF_Bool host_callback(SP_Device* const device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) { + return true; +} + +void PopulateDefaultStreamExecutor(SP_StreamExecutor* se) { + *se = {SP_STREAMEXECUTOR_STRUCT_SIZE}; + se->allocate = allocate; + se->deallocate = deallocate; + se->host_memory_allocate = host_memory_allocate; + se->host_memory_deallocate = host_memory_deallocate; + se->get_allocator_stats = get_allocator_stats; + se->device_memory_usage = device_memory_usage; + se->create_stream = create_stream; + se->destroy_stream = destroy_stream; + se->create_stream_dependency = create_stream_dependency; + se->get_stream_status = get_stream_status; + se->create_event = create_event; + se->destroy_event = destroy_event; + se->get_event_status = get_event_status; + se->record_event = record_event; + se->wait_for_event = wait_for_event; + se->create_timer = create_timer; + se->destroy_timer = destroy_timer; + se->start_timer = start_timer; + se->stop_timer = stop_timer; + se->memcpy_dtoh = memcpy_dtoh; + se->memcpy_htod = memcpy_htod; + se->sync_memcpy_dtoh = sync_memcpy_dtoh; + se->sync_memcpy_htod = sync_memcpy_htod; + se->block_host_for_event = block_host_for_event; + se->synchronize_all_activity = synchronize_all_activity; + se->host_callback = host_callback; +} + +/*** Create SP_TimerFns ***/ +uint64_t nanoseconds(SP_Timer timer) { return timer->timer_id; } + +void PopulateDefaultTimerFns(SP_TimerFns* timer_fns) { + timer_fns->nanoseconds = nanoseconds; +} + +/*** Create SP_Platform ***/ +void create_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultTimerFns(timer_fns); +} +void destroy_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns) {} + +void create_stream_executor(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultStreamExecutor(params->stream_executor); +} +void destroy_stream_executor(const SP_Platform* platform, + SP_StreamExecutor* se) {} + +void create_device(const SP_Platform* platform, SE_CreateDeviceParams* params, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + params->device->struct_size = SP_DEVICE_STRUCT_SIZE; +} +void destroy_device(const SP_Platform* platform, SP_Device* device) {} + +void PopulateDefaultPlatform(SP_Platform* platform, + SP_PlatformFns* platform_fns) { + *platform = {SP_PLATFORM_STRUCT_SIZE}; + platform->name = DEVICE_NAME; + platform->type = DEVICE_TYPE; + platform->visible_device_count = DEVICE_COUNT; + platform_fns->create_device = create_device; + platform_fns->destroy_device = destroy_device; + platform_fns->create_stream_executor = create_stream_executor; + platform_fns->destroy_stream_executor = destroy_stream_executor; + platform_fns->create_timer_fns = create_timer_fns; + platform_fns->destroy_timer_fns = destroy_timer_fns; +} + +void destroy_platform(SP_Platform* const platform) {} +void destroy_platform_fns(SP_PlatformFns* const platform_fns) {} + +/*** 
Registration tests ***/ +TEST(StreamExecutor, SuccessfulRegistration) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + port::Status status = InitStreamExecutorPlugin(plugin_init); + TF_ASSERT_OK(status); + port::StatusOr maybe_platform = + MultiPlatformManager::PlatformWithName("MyDevice"); + TF_ASSERT_OK(maybe_platform.status()); + Platform* platform = maybe_platform.ConsumeValueOrDie(); + ASSERT_EQ(platform->Name(), DEVICE_NAME); + ASSERT_EQ(platform->VisibleDeviceCount(), DEVICE_COUNT); + + port::StatusOr maybe_executor = + platform->ExecutorForDevice(0); + TF_ASSERT_OK(maybe_executor.status()); + StreamExecutor* executor = maybe_executor.ConsumeValueOrDie(); + ASSERT_EQ(executor->GetDeviceDescription().name(), "MyDevice"); +} + +TEST(StreamExecutor, NameNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->name = nullptr; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set."); +} + +TEST(StreamExecutor, CreateDeviceNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform_fns->create_device = nullptr; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ(status.error_message(), + "'create_device' field in SP_PlatformFns must be set."); +} + +TEST(StreamExecutor, UnifiedMemoryAllocateNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->supports_unified_memory = true; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ( + status.error_message(), + "'unified_memory_allocate' field in SP_StreamExecutor must be set."); +} + +/*** StreamExecutor behavior tests ***/ +class StreamExecutorTest : public ::testing::Test { + protected: + StreamExecutorTest() {} + void SetUp() override { + PopulateDefaultPlatform(&platform_, &platform_fns_); + PopulateDefaultStreamExecutor(&se_); + PopulateDefaultTimerFns(&timer_fns_); + } + void TearDown() override {} + + StreamExecutor* GetExecutor(int ordinal) { + if (!cplatform_) { + cplatform_ = absl::make_unique( + platform_, destroy_platform, platform_fns_, destroy_platform_fns, se_, + timer_fns_); + } + port::StatusOr maybe_executor = + cplatform_->ExecutorForDevice(ordinal); + 
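+    // Fail the test immediately if executor creation failed, so the individual +    // test cases below can assume a valid StreamExecutor.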
TF_CHECK_OK(maybe_executor.status()); + return maybe_executor.ConsumeValueOrDie(); + } + SP_Platform platform_; + SP_PlatformFns platform_fns_; + SP_StreamExecutor se_; + SP_TimerFns timer_fns_; + std::unique_ptr cplatform_; +}; + +TEST_F(StreamExecutorTest, Allocate) { + se_.allocate = [](const SP_Device* const device, uint64_t size, + int64_t memory_space, SP_DeviceMemoryBase* const mem) { + mem->struct_size = SP_DEVICE_MEMORY_BASE_STRUCT_SIZE; + mem->opaque = malloc(size); + mem->size = size; + }; + se_.deallocate = [](const SP_Device* const device, + SP_DeviceMemoryBase* const mem) { + EXPECT_EQ(mem->size, 2 * sizeof(int)); + free(mem->opaque); + mem->opaque = nullptr; + mem->size = 0; + }; + StreamExecutor* executor = GetExecutor(0); + DeviceMemory mem = executor->AllocateArray(2); + ASSERT_NE(mem.opaque(), nullptr); + ASSERT_EQ(mem.size(), 2 * sizeof(int)); + executor->Deallocate(&mem); + ASSERT_EQ(mem.opaque(), nullptr); +} + +TEST_F(StreamExecutorTest, HostMemoryAllocate) { + static bool allocate_called = false; + static bool deallocate_called = false; + se_.host_memory_allocate = [](const SP_Device* const device, uint64_t size) { + allocate_called = true; + return malloc(size); + }; + se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) { + free(mem); + deallocate_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(allocate_called); + void* mem = executor->HostMemoryAllocate(8); + ASSERT_NE(mem, nullptr); + ASSERT_TRUE(allocate_called); + ASSERT_FALSE(deallocate_called); + executor->HostMemoryDeallocate(mem); + ASSERT_TRUE(deallocate_called); +} + +TEST_F(StreamExecutorTest, UnifiedMemoryAllocate) { + static bool allocate_called = false; + static bool deallocate_called = false; + se_.unified_memory_allocate = [](const SP_Device* const device, + uint64_t size) { + allocate_called = true; + return malloc(size); + }; + se_.unified_memory_deallocate = [](const SP_Device* const device, void* mem) { + free(mem); + deallocate_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(allocate_called); + void* mem = executor->UnifiedMemoryAllocate(8); + ASSERT_NE(mem, nullptr); + ASSERT_TRUE(allocate_called); + ASSERT_FALSE(deallocate_called); + executor->UnifiedMemoryDeallocate(mem); + ASSERT_TRUE(deallocate_called); +} + +TEST_F(StreamExecutorTest, GetAllocatorStats) { + se_.get_allocator_stats = [](const SP_Device* const device, + SP_AllocatorStats* const stat) -> TF_Bool { + stat->struct_size = SP_ALLOCATORSTATS_STRUCT_SIZE; + stat->bytes_in_use = 123; + return true; + }; + + StreamExecutor* executor = GetExecutor(0); + absl::optional optional_stats = executor->GetAllocatorStats(); + ASSERT_TRUE(optional_stats.has_value()); + AllocatorStats stats = optional_stats.value(); + ASSERT_EQ(stats.bytes_in_use, 123); +} + +TEST_F(StreamExecutorTest, DeviceMemoryUsage) { + se_.device_memory_usage = [](const SP_Device* const device, + int64_t* const free, + int64_t* const total) -> TF_Bool { + *free = 45; + *total = 7; + return true; + }; + + StreamExecutor* executor = GetExecutor(0); + int64 free = 0; + int64 total = 0; + executor->DeviceMemoryUsage(&free, &total); + ASSERT_EQ(free, 45); + ASSERT_EQ(total, 7); +} + +TEST_F(StreamExecutorTest, CreateStream) { + static bool stream_created = false; + static bool stream_deleted = false; + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(14); + stream_created = true; + }; + 
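+  // The destroy callback checks that the SP_Stream created above (id 14) is +  // the same object handed back for cleanup before deleting it.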
se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { + auto custom_stream = static_cast(stream); + ASSERT_EQ(custom_stream->stream_id, 14); + delete custom_stream; + stream_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(stream_created); + Stream* stream = new Stream(executor); + stream->Init(); + ASSERT_TRUE(stream->ok()); + ASSERT_TRUE(stream_created); + ASSERT_FALSE(stream_deleted); + delete stream; + ASSERT_TRUE(stream_deleted); +} + +TEST_F(StreamExecutorTest, CreateStreamDependency) { + static bool create_stream_dependency_called = false; + se_.create_stream_dependency = [](const SP_Device* const device, + SP_Stream dependent, SP_Stream other, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + create_stream_dependency_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream dependent(executor); + dependent.Init(); + Stream other(executor); + other.Init(); + ASSERT_FALSE(create_stream_dependency_called); + dependent.ThenWaitFor(&other); + ASSERT_TRUE(create_stream_dependency_called); +} + +TEST_F(StreamExecutorTest, StreamStatus) { + static bool status_ok = true; + se_.get_stream_status = [](const SP_Device* const device, SP_Stream stream, + TF_Status* const status) -> void { + if (status_ok) { + TF_SetStatus(status, TF_OK, ""); + } else { + TF_SetStatus(status, TF_INTERNAL, "Test error"); + } + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_TRUE(stream.ok()); + TF_ASSERT_OK(stream.RefreshStatus()); + status_ok = false; + auto updated_status = stream.RefreshStatus(); + ASSERT_FALSE(stream.ok()); + ASSERT_EQ(updated_status.error_message(), "Test error"); +} + +TEST_F(StreamExecutorTest, CreateEvent) { + static bool event_created = false; + static bool event_deleted = false; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(123); + event_created = true; + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { + auto custom_event = static_cast(event); + ASSERT_EQ(custom_event->event_id, 123); + delete custom_event; + event_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(event_created); + Event* event = new Event(executor); + event->Init(); + ASSERT_TRUE(event_created); + ASSERT_FALSE(event_deleted); + delete event; + ASSERT_TRUE(event_deleted); +} + +TEST_F(StreamExecutorTest, PollForEventStatus) { + static SE_EventStatus event_status = SE_EVENT_COMPLETE; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(123); + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { delete event; }; + se_.get_event_status = [](const SP_Device* const device, + SP_Event event) -> SE_EventStatus { + EXPECT_EQ(event->event_id, 123); + return event_status; + }; + + StreamExecutor* executor = GetExecutor(0); + Event event(executor); + event.Init(); + ASSERT_EQ(event.PollForStatus(), Event::Status::kComplete); + event_status = SE_EVENT_ERROR; + ASSERT_EQ(event.PollForStatus(), Event::Status::kError); +} + +TEST_F(StreamExecutorTest, RecordAndWaitForEvent) { + static bool record_called = false; + static bool wait_called = false; + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(1); + }; + 
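+  // The callbacks below check that the stream (id 1) and the event (id 2) +  // created by these factory functions are the ones later passed to +  // record_event and wait_for_event.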
se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { delete stream; }; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(2); + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { delete event; }; + se_.record_event = [](const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) { + EXPECT_EQ(stream->stream_id, 1); + EXPECT_EQ(event->event_id, 2); + TF_SetStatus(status, TF_OK, ""); + record_called = true; + }; + se_.wait_for_event = [](const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) { + EXPECT_EQ(stream->stream_id, 1); + EXPECT_EQ(event->event_id, 2); + TF_SetStatus(status, TF_OK, ""); + wait_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Event event(executor); + event.Init(); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(record_called); + stream.ThenRecordEvent(&event); + ASSERT_TRUE(record_called); + ASSERT_FALSE(wait_called); + stream.ThenWaitFor(&event); + ASSERT_TRUE(wait_called); +} + +TEST_F(StreamExecutorTest, CreateTimer) { + static bool timer_created = false; + static bool timer_deleted = false; + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(25); + timer_created = true; + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { + auto custom_timer = static_cast(timer); + EXPECT_EQ(custom_timer->timer_id, 25); + delete custom_timer; + timer_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(timer_created); + Stream stream(executor); + stream.Init(); + Timer* timer = new Timer(executor); + stream.InitTimer(timer); + ASSERT_TRUE(stream.ok()); + ASSERT_TRUE(timer_created); + ASSERT_FALSE(timer_deleted); + delete timer; + ASSERT_TRUE(timer_deleted); +} + +TEST_F(StreamExecutorTest, StartTimer) { + static bool start_called = false; + static bool stop_called = false; + static TF_Code start_timer_status = TF_OK; + static TF_Code stop_timer_status = TF_OK; + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(7); + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { delete timer; }; + se_.start_timer = [](const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) { + TF_SetStatus(status, start_timer_status, ""); + EXPECT_EQ(timer->timer_id, 7); + start_called = true; + }; + se_.stop_timer = [](const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) { + TF_SetStatus(status, stop_timer_status, ""); + EXPECT_EQ(timer->timer_id, 7); + stop_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + Timer timer(executor); + stream.InitTimer(&timer); + + // Check both start and stop succeed + ASSERT_FALSE(start_called); + stream.ThenStartTimer(&timer); + ASSERT_TRUE(start_called); + ASSERT_FALSE(stop_called); + stream.ThenStopTimer(&timer); + ASSERT_TRUE(stop_called); + + // Check start timer fails + ASSERT_TRUE(stream.ok()); + start_timer_status = TF_UNKNOWN; + stream.ThenStartTimer(&timer); + ASSERT_FALSE(stream.ok()); + + // Check stop timer fails + start_timer_status = TF_OK; + stop_timer_status = TF_UNKNOWN; + Stream stream2(executor); + 
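+  // A second stream is used because the first one is already in an error +  // state after the failing ThenStartTimer call above.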
stream2.Init(); + Timer timer2(executor); + stream2.InitTimer(&timer2); + stream2.ThenStartTimer(&timer2); + ASSERT_TRUE(stream2.ok()); + stream2.ThenStopTimer(&timer2); + ASSERT_FALSE(stream2.ok()); +} + +TEST_F(StreamExecutorTest, TimerFns) { + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(25000); + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { delete timer; }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + Timer timer(executor); + stream.InitTimer(&timer); + // Our test nanoseconds function just returns value + // passed to SP_Timer_st constructor. + ASSERT_EQ(timer.Nanoseconds(), 25000); + ASSERT_EQ(timer.Microseconds(), 25); +} + +TEST_F(StreamExecutorTest, MemcpyToHost) { + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(14); + }; + se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { delete stream; }; + + se_.memcpy_dtoh = [](const SP_Device* const device, SP_Stream stream, + void* host_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + EXPECT_EQ(stream->stream_id, 14); + std::memcpy(host_dst, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 34; + int dst_data = 2; + DeviceMemoryBase device_src(&src_data, size); + Stream& stream_ref = stream.ThenMemcpy(&dst_data, device_src, size); + ASSERT_EQ(dst_data, 34); + ASSERT_EQ(stream_ref.implementation(), stream.implementation()); +} + +TEST_F(StreamExecutorTest, MemcpyFromHost) { + se_.memcpy_htod = [](const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, host_src, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + stream.ThenMemcpy(&device_dst, &src_data, size); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, MemcpyDeviceToDevice) { + se_.memcpy_dtod = [](const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + DeviceMemoryBase device_src(&src_data, size); + stream.ThenMemcpy(&device_dst, device_src, size); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, SyncMemcpyToHost) { + se_.sync_memcpy_dtoh = [](const SP_Device* const device, void* host_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(host_dst, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 34; + int dst_data = 2; + 
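+  // Plain host memory stands in for device memory here; the sync_memcpy_dtoh +  // stub above simply performs a std::memcpy.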
DeviceMemoryBase device_src(&src_data, size); + TF_ASSERT_OK(executor->SynchronousMemcpyD2H(device_src, size, &dst_data)); + ASSERT_EQ(dst_data, 34); +} + +TEST_F(StreamExecutorTest, SyncMemcpyFromHost) { + se_.sync_memcpy_htod = + [](const SP_Device* const device, SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, host_src, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + TF_ASSERT_OK(executor->SynchronousMemcpyH2D(&src_data, size, &device_dst)); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, SyncMemcpyDeviceToDevice) { + se_.sync_memcpy_dtod = [](const SP_Device* const device, + SP_DeviceMemoryBase* const device_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + DeviceMemoryBase device_src(&src_data, size); + ASSERT_TRUE(executor->SynchronousMemcpy(&device_dst, device_src, size)); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, BlockHostForEvent) { + static bool block_host_for_event_called = false; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) { + *event = new SP_Event_st(357); + }; + se_.destroy_event = [](const SP_Device* const device, SP_Event event) { + delete event; + }; + se_.block_host_for_event = [](const SP_Device* const device, SP_Event event, + TF_Status* const status) -> void { + ASSERT_EQ(event->event_id, 357); + TF_SetStatus(status, TF_OK, ""); + block_host_for_event_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(block_host_for_event_called); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + ASSERT_TRUE(block_host_for_event_called); +} + +TEST_F(StreamExecutorTest, BlockHostUntilDone) { + static bool block_host_until_done_called = false; + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) { + *stream = new SP_Stream_st(58); + }; + se_.destroy_stream = [](const SP_Device* const device, SP_Stream stream) { + delete stream; + }; + se_.block_host_until_done = [](const SP_Device* const device, + SP_Stream stream, + TF_Status* const status) -> void { + ASSERT_EQ(stream->stream_id, 58); + TF_SetStatus(status, TF_OK, ""); + block_host_until_done_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(block_host_until_done_called); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + ASSERT_TRUE(block_host_until_done_called); +} + +TEST_F(StreamExecutorTest, SynchronizeAllActivity) { + static bool synchronize_all_called = false; + se_.synchronize_all_activity = [](const SP_Device* const device, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + synchronize_all_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(synchronize_all_called); + ASSERT_TRUE(executor->SynchronizeAllActivity()); + ASSERT_TRUE(synchronize_all_called); +} + +TEST_F(StreamExecutorTest, HostCallbackOk) { + se_.host_callback = [](SP_Device* const 
device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) -> TF_Bool { + TF_Status* status = TF_NewStatus(); + callback_fn(callback_arg, status); + bool ok = TF_GetCode(status) == TF_OK; + TF_DeleteStatus(status); + return ok; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + std::function callback = []() -> port::Status { + return port::Status::OK(); + }; + stream.ThenDoHostCallbackWithStatus(callback); + ASSERT_TRUE(stream.ok()); +} + +TEST_F(StreamExecutorTest, HostCallbackError) { + se_.host_callback = [](SP_Device* const device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) -> TF_Bool { + TF_Status* status = TF_NewStatus(); + callback_fn(callback_arg, status); + bool ok = TF_GetCode(status) == TF_OK; + TF_DeleteStatus(status); + return ok; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + std::function callback = []() -> port::Status { + return port::UnimplementedError("Unimplemented"); + }; + stream.ThenDoHostCallbackWithStatus(callback); + ASSERT_FALSE(stream.ok()); +} +} // namespace +} // namespace stream_executor diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 20a6c5117cf..ed501b5b101 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -261,7 +261,6 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, size_t len, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); - static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), "64-bit int types should match in size"); tensorflow::gtl::ArraySlice dimarray( @@ -279,4 +278,73 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, return nullptr; } return tf_tensor; -} \ No newline at end of file +} + +TF_Tensor* TF_ForwardInputOrAllocateOutput( + TF_OpKernelContext* context, int* candidate_input_indices, + int num_candidate_input_indices, int output_index, int64_t* output_dims, + int output_num_dims, int* forwarded_input, TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); + + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + tensorflow::gtl::ArraySlice input_indices_array( + candidate_input_indices, num_candidate_input_indices); + tensorflow::gtl::ArraySlice output_dimarray( + reinterpret_cast(output_dims), output_num_dims); + tensorflow::Tensor* output_tensor_pointer; + tensorflow::Status s = cc_ctx->forward_input_or_allocate_output( + input_indices_array, output_index, + tensorflow::TensorShape(output_dimarray), &output_tensor_pointer, + forwarded_input); + if (!s.ok()) { + ::tensorflow::Set_TF_Status_from_Status(status, s); + return nullptr; + } + TF_Tensor* tf_tensor_output = TF_TensorFromTensor(*output_tensor_pointer, &s); + if (!s.ok()) { + ::tensorflow::Set_TF_Status_from_Status(status, s); + return nullptr; + } + return tf_tensor_output; +} + +TF_Tensor* TF_AllocateTemp(TF_OpKernelContext* context, TF_DataType dtype, + int64_t* dims, int num_dims, + TF_AllocatorAttributes* attributes, + TF_Status* status) { + auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); + TF_SetStatus(status, TF_OK, ""); + static_assert(sizeof(int64_t) == sizeof(tensorflow::int64), + "64-bit int types should match in size"); + tensorflow::gtl::ArraySlice dimarray( + 
reinterpret_cast<tensorflow::int64*>(dims), num_dims); + if (attributes && !attributes->struct_size) { + TF_SetStatus( + status, TF_INVALID_ARGUMENT, + "TF_AllocatorAttributes struct " + "size member must be set to TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE"); + return nullptr; + } + tensorflow::AllocatorAttributes allocator_attr; + if (attributes && attributes->on_host) { + allocator_attr.set_on_host(true); + } + tensorflow::Status s; + tensorflow::Tensor tensor; + s = cc_ctx->allocate_temp(static_cast<tensorflow::DataType>(dtype), + tensorflow::TensorShape(dimarray), &tensor, + allocator_attr); + if (!s.ok()) { + ::tensorflow::Set_TF_Status_from_Status(status, s); + return nullptr; + } + TF_Tensor* tf_tensor; + tf_tensor = TF_TensorFromTensor(tensor, &s); + if (!s.ok()) { + ::tensorflow::Set_TF_Status_from_Status(status, s); + return nullptr; + } + return tf_tensor; +} diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h index c7138a39c73..489aa5399a5 100644 --- a/tensorflow/c/kernels.h +++ b/tensorflow/c/kernels.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/c_api.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" // Macro to control visibility of exported symbols in the shared library (.so, // .dylib, .dll). @@ -199,6 +200,26 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int64_t* dims, int num_dims, size_t len, TF_Status* status); +// Tries to forward one of the inputs given in candidate_input_indices to +// output[output_index]. If none of the given inputs can be forwarded, calls +// allocate_output() to allocate a new output buffer. The index of the +// forwarded input will be assigned to output argument forwarded_input (if it's +// not nullptr). If no inputs are forwarded, forwarded_input will be assigned +// -1. +TF_CAPI_EXPORT TF_Tensor* TF_ForwardInputOrAllocateOutput( + TF_OpKernelContext* context, int* candidate_input_indices, + int num_candidate_input_indices, int output_index, int64_t* output_dims, + int output_num_dims, int* forwarded_input, TF_Status* status); + +// Allocates a temporary Tensor of the specified type and shape. The +// Tensor must not be used after kernel construction is +// complete.
+// +// num_dims must equal the size of array dims +TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTemp( + TF_OpKernelContext* context, TF_DataType dtype, int64_t* dims, int num_dims, + TF_AllocatorAttributes* alloc_attrs, TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index 5fec068bd73..6bb2b347a30 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -39,6 +39,33 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "histogram_summary_op", + prefix = "histogram_summary_op", + deps = [ + "//tensorflow/c:kernels", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_tensor", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//third_party/eigen3", + ], +) + +tf_kernel_library( + name = "merge_summary_op", + prefix = "merge_summary_op", + deps = [ + "//tensorflow/c:kernels", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_tensor", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + tf_gen_op_libs( op_lib_names = ["bitcast"], deps = [ @@ -59,6 +86,24 @@ tf_gen_op_libs( ], ) +tf_gen_op_libs( + op_lib_names = ["histogram_summary"], + deps = [ + "//tensorflow/c:ops", + "//tensorflow/c:tf_status", + "//tensorflow/core:lib", + ], +) + +tf_gen_op_libs( + op_lib_names = ["merge_summary"], + deps = [ + "//tensorflow/c:ops", + "//tensorflow/c:tf_status", + "//tensorflow/core:lib", + ], +) + tf_cc_test( name = "bitcast_op_test", srcs = ["bitcast_op_test.cc"], @@ -87,6 +132,23 @@ tf_cc_test( ], ) +tf_cc_test( + name = "summary_op_benchmark_test", + size = "small", + srcs = ["summary_op_benchmark_test.cc"], + deps = [ + ":summary_op", + "//tensorflow/c:kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "tensor_shape_utils", srcs = ["tensor_shape_utils.cc"], @@ -122,6 +184,8 @@ filegroup( name = "android_all_op_kernels", srcs = [ "bitcast_op.cc", + "histogram_summary_op.cc", + "merge_summary_op.cc", "summary_op.cc", "tensor_shape_utils.cc", "tensor_shape_utils.h", @@ -133,6 +197,8 @@ filegroup( name = "android_all_ops", srcs = [ "ops/bitcast.cc", + "ops/histogram_summary.cc", + "ops/merge_summary.cc", "ops/summary.cc", ], ) diff --git a/tensorflow/c/kernels/histogram_summary_op.cc b/tensorflow/c/kernels/histogram_summary_op.cc new file mode 100644 index 00000000000..143a2675a05 --- /dev/null +++ b/tensorflow/c/kernels/histogram_summary_op.cc @@ -0,0 +1,165 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/c/kernels.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/platform/bfloat16.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/platform/types.h" + +namespace { + +// Operators used to create a std::unique_ptr for TF_Tensor and TF_Status. +struct TFTensorDeleter { + void operator()(TF_Tensor* tf_tensor) const { TF_DeleteTensor(tf_tensor); } +}; + +struct TFStatusDeleter { + void operator()(TF_Status* tf_status) const { TF_DeleteStatus(tf_status); } +}; + +// Struct that wraps TF_Tensor and TF_Status to delete once out of scope. +using Safe_TF_TensorPtr = std::unique_ptr; +using Safe_TF_StatusPtr = std::unique_ptr; + +// Used to pass the operation node name from kernel construction to +// kernel computation. +struct HistogramSummaryOp { + std::string op_node_name; +}; + +void* HistogramSummaryOp_Create(TF_OpKernelConstruction* ctx) { + HistogramSummaryOp* kernel = new HistogramSummaryOp; + TF_StringView string_view_name = TF_OpKernelConstruction_GetName(ctx); + kernel->op_node_name = + std::string(string_view_name.data, string_view_name.len); + return kernel; +} + +void HistogramSummaryOp_Delete(void* kernel) { + delete static_cast(kernel); +} + +template +void HistogramSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { + HistogramSummaryOp* k = static_cast(kernel); + TF_Tensor* tags; + TF_Tensor* values; + Safe_TF_StatusPtr status(TF_NewStatus()); + TF_GetInput(ctx, 0, &tags, status.get()); + Safe_TF_TensorPtr safe_tags_ptr(tags); + if (TF_GetCode(status.get()) != TF_OK) { + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + TF_GetInput(ctx, 1, &values, status.get()); + Safe_TF_TensorPtr safe_values_ptr(values); + if (TF_GetCode(status.get()) != TF_OK) { + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + if (TF_NumDims(safe_tags_ptr.get()) != 0) { + TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, "tags must be scalar"); + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + // Cast values to array to access tensor elements by index + auto values_array = static_cast(TF_TensorData(safe_values_ptr.get())); + tensorflow::histogram::Histogram histo; + for (int64_t i = 0; i < TF_TensorElementCount(safe_values_ptr.get()); ++i) { + const double double_val = static_cast(values_array[i]); + if (Eigen::numext::isnan(double_val)) { + std::ostringstream err; + err << "Nan in summary histogram for: " << k->op_node_name; + TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } else if (Eigen::numext::isinf(double_val)) { + std::ostringstream err; + err << "Infinity in Histogram for: " << k->op_node_name; + TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + histo.Add(double_val); + } + tensorflow::Summary s; + tensorflow::Summary::Value* v = s.add_value(); + const tensorflow::tstring& tag = + 
*(static_cast<tensorflow::tstring*>(TF_TensorData(safe_tags_ptr.get()))); + v->set_tag(tag.data(), tag.size()); + histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */); + + Safe_TF_TensorPtr summary_tensor(TF_AllocateOutput( + /*context=*/ctx, /*index=*/0, /*dtype=*/TF_ExpectedOutputDataType(ctx, 0), + /*dims=*/nullptr, /*num_dims=*/0, + /*len=*/sizeof(tensorflow::tstring), status.get())); + + if (TF_GetCode(status.get()) != TF_OK) { + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + tensorflow::tstring* output_tstring = reinterpret_cast<tensorflow::tstring*>( + TF_TensorData(summary_tensor.get())); + CHECK(SerializeToTString(s, output_tstring)); +} + +template <typename T> +void RegisterHistogramSummaryOpKernel() { + TF_Status* status = TF_NewStatus(); + { + auto* builder = TF_NewKernelBuilder( + "HistogramSummary", tensorflow::DEVICE_CPU, &HistogramSummaryOp_Create, + &HistogramSummaryOp_Compute<T>, &HistogramSummaryOp_Delete); + TF_KernelBuilder_TypeConstraint( + builder, "T", + static_cast<TF_DataType>(tensorflow::DataTypeToEnum<T>::v()), status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << "Error while adding type constraint"; + TF_RegisterKernelBuilder("HistogramSummary", builder, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) + << "Error while registering Histogram Summary kernel"; + } + TF_DeleteStatus(status); +} + +// A dummy static variable initialized by a lambda whose side-effect is to +// register the Histogram Summary kernel. +TF_ATTRIBUTE_UNUSED static bool IsHistogramSummaryOpKernelRegistered = []() { + if (SHOULD_REGISTER_OP_KERNEL("HistogramSummary")) { + RegisterHistogramSummaryOpKernel<tensorflow::int64>(); + RegisterHistogramSummaryOpKernel<tensorflow::uint64>(); + RegisterHistogramSummaryOpKernel<tensorflow::int32>(); + RegisterHistogramSummaryOpKernel<tensorflow::uint32>(); + RegisterHistogramSummaryOpKernel<tensorflow::uint16>(); + RegisterHistogramSummaryOpKernel<tensorflow::int16>(); + RegisterHistogramSummaryOpKernel<tensorflow::int8>(); + RegisterHistogramSummaryOpKernel<tensorflow::uint8>(); + RegisterHistogramSummaryOpKernel<Eigen::half>(); + RegisterHistogramSummaryOpKernel<tensorflow::bfloat16>(); + RegisterHistogramSummaryOpKernel<float>(); + RegisterHistogramSummaryOpKernel<double>(); + } + return true; +}(); +} // namespace diff --git a/tensorflow/c/kernels/merge_summary_op.cc b/tensorflow/c/kernels/merge_summary_op.cc new file mode 100644 index 00000000000..e45029319e5 --- /dev/null +++ b/tensorflow/c/kernels/merge_summary_op.cc @@ -0,0 +1,123 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ +#include +#include +#include + +#include "tensorflow/c/kernels.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/tstring.h" + +namespace { + +// Operators used to create a std::unique_ptr for TF_Tensor and TF_Status +struct TFTensorDeleter { + void operator()(TF_Tensor* tf_tensor) const { TF_DeleteTensor(tf_tensor); } +}; + +struct TFStatusDeleter { + void operator()(TF_Status* tf_status) const { TF_DeleteStatus(tf_status); } +}; + +// Struct that wraps TF_Tensor and TF_Status to delete once out of scope +using Safe_TF_TensorPtr = std::unique_ptr; +using Safe_TF_StatusPtr = std::unique_ptr; + +// dummy functions used for kernel registration +void* MergeSummaryOp_Create(TF_OpKernelConstruction* ctx) { return nullptr; } + +void MergeSummaryOp_Delete(void* kernel) {} + +void MergeSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { + tensorflow::Summary s; + std::unordered_set tags; + Safe_TF_StatusPtr status(TF_NewStatus()); + for (int input_num = 0; input_num < TF_NumInputs(ctx); ++input_num) { + TF_Tensor* input; + TF_GetInput(ctx, input_num, &input, status.get()); + Safe_TF_TensorPtr safe_input_ptr(input); + if (TF_GetCode(status.get()) != TF_OK) { + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + auto tags_array = + static_cast(TF_TensorData(safe_input_ptr.get())); + for (int i = 0; i < TF_TensorElementCount(safe_input_ptr.get()); ++i) { + const tensorflow::tstring& s_in = tags_array[i]; + tensorflow::Summary summary_in; + if (!tensorflow::ParseProtoUnlimited(&summary_in, s_in)) { + TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, + "Could not parse one of the summary inputs"); + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + for (int v = 0; v < summary_in.value_size(); ++v) { + // This tag is unused by the TensorSummary op, so no need to check for + // duplicates. 
const tensorflow::string& tag = summary_in.value(v).tag(); + if ((!tag.empty()) && !tags.insert(tag).second) { + std::ostringstream err; + err << "Duplicate tag " << tag << " found in summary inputs "; + TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + *s.add_value() = summary_in.value(v); + } + } + } + Safe_TF_TensorPtr summary_tensor(TF_AllocateOutput( + /*context=*/ctx, /*index=*/0, /*dtype=*/TF_ExpectedOutputDataType(ctx, 0), + /*dims=*/nullptr, /*num_dims=*/0, + /*len=*/sizeof(tensorflow::tstring), status.get())); + if (TF_GetCode(status.get()) != TF_OK) { + TF_OpKernelContext_Failure(ctx, status.get()); + return; + } + tensorflow::tstring* output_tstring = reinterpret_cast<tensorflow::tstring*>( + TF_TensorData(summary_tensor.get())); + CHECK(SerializeToTString(s, output_tstring)); +} + +void RegisterMergeSummaryOpKernel() { + TF_Status* status = TF_NewStatus(); + { + auto* builder = TF_NewKernelBuilder( + "MergeSummary", tensorflow::DEVICE_CPU, &MergeSummaryOp_Create, + &MergeSummaryOp_Compute, &MergeSummaryOp_Delete); + TF_RegisterKernelBuilder("MergeSummary", builder, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) + << "Error while registering Merge Summary kernel"; + } + TF_DeleteStatus(status); +} + +// A dummy static variable initialized by a lambda whose side-effect is to +// register the Merge Summary kernel. +TF_ATTRIBUTE_UNUSED static bool IsMergeSummaryOpKernelRegistered = []() { + if (SHOULD_REGISTER_OP_KERNEL("MergeSummary")) { + RegisterMergeSummaryOpKernel(); + } + return true; +}(); + +} // namespace diff --git a/tensorflow/c/kernels/ops/histogram_summary.cc b/tensorflow/c/kernels/ops/histogram_summary.cc new file mode 100644 index 00000000000..67d4d1b0a5b --- /dev/null +++ b/tensorflow/c/kernels/ops/histogram_summary.cc @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/c/ops.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +static void histogram_summary_shape_inference_fn(TF_ShapeInferenceContext* ctx, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + TF_ShapeHandle* result = TF_ShapeInferenceContextScalar(ctx); + TF_ShapeInferenceContextSetOutput(ctx, 0, result, status); + TF_DeleteShapeHandle(result); +} + +void Register_HistogramSummaryOp() { + TF_Status* status = TF_NewStatus(); + + TF_OpDefinitionBuilder* op_builder = + TF_NewOpDefinitionBuilder("HistogramSummary"); + TF_OpDefinitionBuilderAddInput(op_builder, "tag: string"); + TF_OpDefinitionBuilderAddInput(op_builder, "values: T"); + TF_OpDefinitionBuilderAddOutput(op_builder, "summary: string"); + TF_OpDefinitionBuilderAddAttr(op_builder, "T: realnumbertype = DT_FLOAT"); + TF_OpDefinitionBuilderSetShapeInferenceFunction( + op_builder, &histogram_summary_shape_inference_fn); + + TF_RegisterOpDefinition(op_builder, status); + CHECK_EQ(TF_GetCode(status), TF_OK) + << "HistogramSummary op registration failed: " << TF_Message(status); + TF_DeleteStatus(status); +} + +TF_ATTRIBUTE_UNUSED static bool HistogramSummaryOpRegistered = []() { + if (SHOULD_REGISTER_OP("HistogramSummary")) { + Register_HistogramSummaryOp(); + } + return true; +}(); diff --git a/tensorflow/c/kernels/ops/merge_summary.cc b/tensorflow/c/kernels/ops/merge_summary.cc new file mode 100644 index 00000000000..991c469fff6 --- /dev/null +++ b/tensorflow/c/kernels/ops/merge_summary.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/c/ops.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +static void merge_summary_shape_inference_fn(TF_ShapeInferenceContext* ctx, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + TF_ShapeHandle* result = TF_ShapeInferenceContextScalar(ctx); + TF_ShapeInferenceContextSetOutput(ctx, 0, result, status); + TF_DeleteShapeHandle(result); +} + +void Register_MergeSummaryOp() { + TF_Status* status = TF_NewStatus(); + + TF_OpDefinitionBuilder* op_builder = + TF_NewOpDefinitionBuilder("MergeSummary"); + TF_OpDefinitionBuilderAddInput(op_builder, "inputs: N * string"); + TF_OpDefinitionBuilderAddOutput(op_builder, "summary: string"); + TF_OpDefinitionBuilderAddAttr(op_builder, "N: int >= 1"); + TF_OpDefinitionBuilderSetShapeInferenceFunction( + op_builder, &merge_summary_shape_inference_fn); + + TF_RegisterOpDefinition(op_builder, status); + CHECK_EQ(TF_GetCode(status), TF_OK) + << "MergeSummary op registration failed: " << TF_Message(status); + TF_DeleteStatus(status); +} + +TF_ATTRIBUTE_UNUSED static bool MergeSummaryOpRegistered = []() { + if (SHOULD_REGISTER_OP("MergeSummary")) { + Register_MergeSummaryOp(); + } + return true; +}(); diff --git a/tensorflow/c/kernels/summary_op.cc b/tensorflow/c/kernels/summary_op.cc index bd528da4165..ac7eced0ae7 100644 --- a/tensorflow/c/kernels/summary_op.cc +++ b/tensorflow/c/kernels/summary_op.cc @@ -25,7 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/selective_registration.h" #include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" diff --git a/tensorflow/c/kernels/summary_op_benchmark_test.cc b/tensorflow/c/kernels/summary_op_benchmark_test.cc new file mode 100644 index 00000000000..887a86066d3 --- /dev/null +++ b/tensorflow/c/kernels/summary_op_benchmark_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+Graph* BM_ScalarSummaryOp(TensorShape shape, std::string tag, float value) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor tags(DT_STRING, shape);
+  Tensor values(DT_FLOAT, shape);
+  for (int i = 0; i < tags.NumElements(); ++i) {
+    tags.flat<tstring>()(i) = tag;
+    values.flat<float>()(i) = value;
+  }
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("dummy"), "ScalarSummary")
+                  .Input(test::graph::Constant(g, tags))
+                  .Input(test::graph::Constant(g, values))
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(g, &ret));
+  return g;
+}
+
+// Macro used to parse an initializer list into a TensorShape.
+#define DIMARGS(...) \
+  { __VA_ARGS__ }
+// Random parameters for testing
+constexpr char longTagParam[] = "LONGTAG____________________________";
+constexpr float largeValueParam = 2352352.2623433;
+
+#define BM_ScalarSummaryDev(device, dims, name, tag, value) \
+  void BM_ScalarSummary##name##device(int iters) {          \
+    testing::StopTiming();                                  \
+    TensorShape tensorshape(DIMARGS dims);                  \
+    auto g = BM_ScalarSummaryOp(tensorshape, #tag, value);  \
+    testing::StartTiming();                                 \
+    test::Benchmark("cpu", g).Run(iters);                   \
+  }                                                         \
+  BENCHMARK(BM_ScalarSummary##name##device);
+
+BM_ScalarSummaryDev(Cpu, (5, 10, 100), Base, Tag, 5.2);
+// Benchmark for large shapes
+BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeShape, Tag, 5.2);
+// Benchmark for large tag tstring
+BM_ScalarSummaryDev(Cpu, (5, 10, 100), LongTag, longTagParam, 5.2);
+// Benchmark for large values
+BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeValue, Tag, largeValueParam);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index 3c8ac934428..c9df2cc34d1 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -368,6 +368,16 @@ class DeviceKernelOpTest : public OpsTestBase {
 #endif
 };
+
+// Validates that the tensor has shape and type corresponding to
+// dims and dtype.
+void validate_tensor(TF_Tensor* tensor, int64_t* dims, int64_t num_dims,
+                     TF_DataType dtype);
+
+// Copies data of length tensor_size_bytes from values to tensor.
+template +void set_tensor_data(TF_Tensor* tensor, T* values, size_t tensor_size_bytes, + TF_OpKernelContext* ctx); + REGISTER_OP("AllocateOutputOp1").Output("output1: float"); TEST_F(DeviceKernelOpTest, TestAllocateOutputSizeOne) { @@ -379,22 +389,11 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSizeOne) { TF_Tensor* output = TF_AllocateOutput( /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim, /*num_dims=*/1, /*len=*/tensor_size_bytes, s); - EXPECT_EQ(TF_OK, TF_GetCode(s)); - EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); - EXPECT_EQ(1, TF_NumDims(output)); - EXPECT_EQ(1, TF_Dim(output, 0)); + validate_tensor(output, &dim, 1, TF_FLOAT); // Set output to 3 - float* data = reinterpret_cast(TF_TensorData(output)); - float value = 3.0f; -#if GOOGLE_CUDA - OpKernelContext* cc_ctx = reinterpret_cast(ctx); - cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, &value, - tensor_size_bytes); -#else - *data = value; -#endif - + float values[1] = {3.0f}; + set_tensor_data(output, values, tensor_size_bytes, ctx); TF_DeleteStatus(s); TF_DeleteTensor(output); }; @@ -417,12 +416,8 @@ TEST_F(DeviceKernelOpTest, TestAllocateEmptyOutput) { TF_Tensor* output = TF_AllocateOutput( /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/&dim, /*num_dims=*/1, /*len=*/0, s); - EXPECT_EQ(TF_OK, TF_GetCode(s)); - EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); - EXPECT_EQ(1, TF_NumDims(output)); - EXPECT_EQ(0, TF_Dim(output, 0)); - + validate_tensor(output, &dim, 1, TF_FLOAT); TF_DeleteStatus(s); TF_DeleteTensor(output); }; @@ -442,27 +437,16 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) { TF_Status* s = TF_NewStatus(); // Allocate 2x3 output int64_t dim[2] = {2, 3}; - size_t tensor_size_bytes = 6 * TF_DataTypeSize(TF_FLOAT); + size_t tensor_size_bytes = TF_DataTypeSize(TF_FLOAT) * 6; TF_Tensor* output = TF_AllocateOutput( /*context=*/ctx, /*index=*/0, /*dtype=*/TF_FLOAT, /*dims=*/dim, /*num_dims=*/2, /*len=*/tensor_size_bytes, s); EXPECT_EQ(TF_OK, TF_GetCode(s)); - EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); - EXPECT_EQ(2, TF_NumDims(output)); - EXPECT_EQ(2, TF_Dim(output, 0)); - EXPECT_EQ(3, TF_Dim(output, 1)); + validate_tensor(output, dim, 2, TF_FLOAT); // Set output to [1 2 3 4 5 6] - void* data = TF_TensorData(output); - float value[6] = {1, 2, 3, 4, 5, 6}; -#if GOOGLE_CUDA - OpKernelContext* cc_ctx = reinterpret_cast(ctx); - cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, value, - tensor_size_bytes); -#else - memcpy(data, value, tensor_size_bytes); -#endif - + float values[6] = {1, 2, 3, 4, 5, 6}; + set_tensor_data(output, values, tensor_size_bytes, ctx); TF_DeleteStatus(s); TF_DeleteTensor(output); }; @@ -474,4 +458,200 @@ TEST_F(DeviceKernelOpTest, TestAllocateOutputSize2x3) { EXPECT_EQ("Tensor", output->DebugString(100)); } + +REGISTER_OP("AllocateTempOp1").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateTempSizeOne) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + // Allocate scalar TF_Tensor + TF_Status* s = TF_NewStatus(); + int64_t dim = 1; + TF_AllocatorAttributes alloc_attrs; + alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; +#if GOOGLE_CUDA + alloc_attrs.on_host = 0; +#else + alloc_attrs.on_host = 1; +#endif + TF_Tensor* output = TF_AllocateTemp( + /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/&dim, + /*num_dims=*/1, /*allocator_attributes*/ &alloc_attrs, s); + size_t tensor_size_bytes = TF_DataTypeSize(TF_FLOAT); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + validate_tensor(output, &dim, 1, TF_FLOAT); + + // Set 
TF_Tensor value to 3 + float values[1] = {3.0f}; + set_tensor_data(output, values, tensor_size_bytes, ctx); + TF_SetOutput(ctx, 0, output, s); + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateTempOp1", "AllocateTemp1", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} + +REGISTER_OP("AllocateTempOp0").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateTempEmpty) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + TF_Status* s = TF_NewStatus(); + // Allocate empty TF_Tensor + int64_t dim = 0; + TF_AllocatorAttributes alloc_attrs; + alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; +#if GOOGLE_CUDA + alloc_attrs.on_host = 0; +#else + alloc_attrs.on_host = 1; +#endif + TF_Tensor* output = TF_AllocateTemp( + /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/&dim, + /*num_dims=*/1, /*allocator_attributes*/ &alloc_attrs, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + validate_tensor(output, &dim, 1, TF_FLOAT); + TF_SetOutput(ctx, 0, output, s); + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateTempOp0", "AllocateTemp0", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} + +REGISTER_OP("AllocateTempOp2x3").Output("output1: float"); + +TEST_F(DeviceKernelOpTest, TestAllocateTempSize2x3) { + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + TF_Status* s = TF_NewStatus(); + size_t tensor_size_bytes = 6 * TF_DataTypeSize(TF_FLOAT); + // Allocate 2x3 TF_Tensor + int64_t dim[2] = {2, 3}; + TF_AllocatorAttributes alloc_attrs; + alloc_attrs.struct_size = TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE; +#if GOOGLE_CUDA + alloc_attrs.on_host = 0; +#else + alloc_attrs.on_host = 1; +#endif + TF_Tensor* output = TF_AllocateTemp( + /*context=*/ctx, /*dtype=*/TF_FLOAT, /*dims=*/dim, + /*num_dims=*/2, /*allocator_attributes*/ &alloc_attrs, s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + validate_tensor(output, dim, 2, TF_FLOAT); + + // Set TF_Tensor values to [1 2 3 4 5 6] + float values[6] = {1, 2, 3, 4, 5, 6}; + set_tensor_data(output, values, tensor_size_bytes, ctx); + TF_SetOutput(ctx, 0, output, s); + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + SetupOp("AllocateTempOp2x3", "AllocateTempOp2x3", my_compute_func); + + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ("Tensor", + output->DebugString(100)); +} + +TEST_F(DeviceKernelOpTest, TestForwardInputOrAllocateOutput) { + const char* node_name = "TestForwardInputOrAllocateOutputKernel"; + const char* op_name = "BazOp"; + const char* device_name = "FakeDeviceName"; + + REGISTER_OP(op_name) + .Input("input1: float") + .Input("input2: float") + .Output("output1: float") + .Attr("SomeDataTypeAttr: type"); + + // A kernel whose Compute function that forwards a scalar input to output + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + TF_Status* s = TF_NewStatus(); + int candidate_input_indices[1] = {0}; + int forwarded_input; + int64_t output_dims[1] = {}; + TF_Tensor* output = TF_ForwardInputOrAllocateOutput( + /*context=*/ctx, candidate_input_indices, + /*num_candidate_input_indices=*/1, + /*output_index=*/0, output_dims, /*output_num_dims=*/0, + &forwarded_input, /*status=*/s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + EXPECT_EQ(forwarded_input, 0); + EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); + EXPECT_EQ(0, TF_NumDims(output)); + 
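// Editorial note (not part of the patch): TF_ForwardInputOrAllocateOutput
// either reuses one of the candidate inputs' buffers as output 0 (reporting
// which one via forwarded_input, expected to be input 0 here) or falls back
// to allocating a fresh output, mirroring
// OpKernelContext::forward_input_or_allocate_output in the C++ API.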
TF_DeleteStatus(s);
+    TF_DeleteTensor(output);
+  };
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr,
+                                                  my_compute_func, nullptr);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(node_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_DeleteStatus(status);
+  }
+
+  {
+    OpKernelContext::Params p;
+    DummyDevice dummy_device(nullptr);
+    p.device = &dummy_device;
+    AllocatorAttributes alloc_attrs;
+    p.output_attr_array = &alloc_attrs;
+
+    Tensor t(123.0f);
+
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    // GetFakeKernel requires a NodeDef with two inputs
+    inputs.emplace_back(&t);
+    inputs.emplace_back();
+    p.inputs = &inputs;
+
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, node_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+
+    p.op_kernel = kernel.get();
+    OpKernelContext ctx(&p);
+    kernel->Compute(&ctx);
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<float>()());
+  }
+}
+
+void validate_tensor(TF_Tensor* tensor, int64_t* dims, int64_t num_dims,
+                     TF_DataType dtype) {
+  EXPECT_EQ(TF_FLOAT, TF_TensorType(tensor));
+  EXPECT_EQ(num_dims, TF_NumDims(tensor));
+  for (int i = 0; i < num_dims; ++i) {
+    EXPECT_EQ(dims[i], TF_Dim(tensor, i));
+  }
+}
+
+template <typename T>
+void set_tensor_data(TF_Tensor* tensor, T* values, size_t tensor_size_bytes,
+                     TF_OpKernelContext* ctx) {
+  T* data = reinterpret_cast<T*>(TF_TensorData(tensor));
+#if GOOGLE_CUDA
+  OpKernelContext* cc_ctx = reinterpret_cast<OpKernelContext*>(ctx);
+  cc_ctx->eigen_gpu_device().memcpyHostToDevice(data, values,
+                                                tensor_size_bytes);
+#else
+  memcpy(data, values, tensor_size_bytes);
+#endif
+}
 }  // namespace tensorflow
diff --git a/tensorflow/c/logging.cc b/tensorflow/c/logging.cc
index bf6bf069fff..13c9e6ac208 100644
--- a/tensorflow/c/logging.cc
+++ b/tensorflow/c/logging.cc
@@ -28,6 +28,7 @@ void TF_Log(TF_LogLevel level, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
   auto message = BuildMessage(fmt, args);
+  va_end(args);
   switch (level) {
     case TF_INFO:
       LOG(INFO) << message;
@@ -48,6 +49,7 @@ void TF_VLog(int level, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
   auto message = BuildMessage(fmt, args);
+  va_end(args);
   VLOG(level) << message;
 }
@@ -55,5 +57,6 @@ void TF_DVLog(int level, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
   auto message = BuildMessage(fmt, args);
+  va_end(args);
   DVLOG(level) << message;
 }
diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h
index ff8085f1229..a895e608159 100644
--- a/tensorflow/c/tf_status_helper.h
+++ b/tensorflow/c/tf_status_helper.h
@@ -28,6 +28,14 @@ void Set_TF_Status_from_Status(TF_Status* tf_status,
 // Returns a "status" from "tf_status".
tensorflow::Status StatusFromTF_Status(const TF_Status* tf_status); +namespace internal { +struct TF_StatusDeleter { + void operator()(TF_Status* tf_status) const { TF_DeleteStatus(tf_status); } +}; +} // namespace internal + +using TF_StatusPtr = std::unique_ptr; + } // namespace tensorflow #endif // TENSORFLOW_C_TF_STATUS_HELPER_H_ diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index 0feb986ce44..39d2683226f 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -288,7 +288,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, Status* status) { if (!tensor.CopyFrom(src, src.shape())) { return nullptr; } - return new TF_Tensor{new tensorflow::TensorInterface(tensor)}; + return new TF_Tensor{new tensorflow::TensorInterface(std::move(tensor))}; } Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { diff --git a/tensorflow/c/tf_tensor.h b/tensorflow/c/tf_tensor.h index acdf053e63a..e0a026f984f 100644 --- a/tensorflow/c/tf_tensor.h +++ b/tensorflow/c/tf_tensor.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" @@ -45,6 +46,16 @@ limitations under the License. extern "C" { #endif +// Allocator Attributes used for tensor allocation. +typedef struct TF_AllocatorAttributes { + size_t struct_size; + // Set boolean to 1 for CPU allocation, else 0. + TF_Bool on_host; +} TF_AllocatorAttributes; + +#define TF_ALLOCATOR_ATTRIBUTES_STRUCT_SIZE \ + TF_OFFSET_OF_END(TF_AllocatorAttributes, on_host) + // -------------------------------------------------------------------------- // TF_Tensor holds a multi-dimensional array of elements of a single data type. // For all types other than TF_STRING, the data buffer stores elements diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index e1fad8e697a..8602bfafff8 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -558,6 +558,7 @@ tf_gen_op_wrappers_cc( "io_ops", "linalg_ops", "list_ops", + "map_ops", "logging_ops", "lookup_ops", "manip_ops", diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a67d349bab7..fddbcfec6e6 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -47,6 +47,7 @@ cc_library( # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate # tf_lib depending on the build platform. 
+ "@com_google_absl//absl/memory:memory", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", ]), @@ -56,7 +57,7 @@ tf_cc_test( name = "reader_test", srcs = ["reader_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -148,7 +149,7 @@ tf_cc_test( name = "bundle_v2_test", srcs = ["bundle_v2_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -165,12 +166,13 @@ tf_cc_test( name = "saved_model_bundle_test", srcs = ["saved_model_bundle_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ ":constants", ":loader", + ":reader", ":signature_constants", ":tag_constants", "//tensorflow/core:lib", @@ -186,7 +188,7 @@ tf_cc_test( name = "saved_model_bundle_lite_test", srcs = ["saved_model_bundle_lite_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -225,7 +227,7 @@ py_binary( # TODO(b/32673259): add a test to continuously validate these files. filegroup( - name = "saved_model_half_plus_two", + name = "saved_model_test_files", srcs = glob([ "testdata/half_plus_two_pbtxt/**", "testdata/half_plus_two_main_op/**", @@ -234,9 +236,15 @@ filegroup( "testdata/x_plus_y_v2_debuginfo/**", "testdata/CyclicModule/**", "testdata/VarsAndArithmeticObjectGraph/**", + "testdata/fuzz_generated/**", ]), ) +alias( + name = "saved_model_half_plus_two", + actual = ":saved_model_test_files", +) + exports_files( glob([ "testdata/half_plus_two_pbtxt/**", @@ -246,5 +254,6 @@ exports_files( "testdata/x_plus_y_v2_debuginfo/**", "testdata/CyclicModule/**", "testdata/VarsAndArithmeticObjectGraph/**", + "testdata/fuzz_generated/**", ]), ) diff --git a/tensorflow/cc/saved_model/experimental/public/BUILD b/tensorflow/cc/saved_model/experimental/public/BUILD index 3e9a671a61f..9640848ebf5 100644 --- a/tensorflow/cc/saved_model/experimental/public/BUILD +++ b/tensorflow/cc/saved_model/experimental/public/BUILD @@ -51,8 +51,32 @@ cc_library( deps = [ ":concrete_function", ":concrete_function_list", + ":signature_def_function", "//tensorflow/c/experimental/saved_model/public:saved_model_api", "//tensorflow/cc/experimental/base/public:runtime", "//tensorflow/cc/experimental/base/public:status", ], ) + +cc_library( + name = "signature_def_function", + hdrs = [ + "signature_def_function.h", + ], + deps = [ + ":signature_def_function_metadata", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/experimental/saved_model/public:signature_def_function", + "//tensorflow/cc/experimental/base/public:status", + ], +) + +cc_library( + name = "signature_def_function_metadata", + hdrs = [ + "signature_def_function_metadata.h", + ], + deps = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function_metadata", + ], +) diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h index 04018bf2aab..c2bfb4dcf83 100644 --- a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/cc/experimental/base/public/status.h" #include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" #include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" +#include "tensorflow/cc/saved_model/experimental/public/signature_def_function.h" namespace tensorflow { namespace experimental { @@ -80,8 +81,8 @@ class SavedModelAPI { // If status is not OK, returns nullptr. Otherwise, returns a // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer // is bound to SavedModelAPI it was loaded from. - ConcreteFunction* GetSignatureDefFunction(const std::string& function_path, - Status* status); + SignatureDefFunction* GetSignatureDefFunction( + const std::string& function_path, Status* status); // Lists all Conrete Functions available from the SavedModel. std::vector ListFunctions(); @@ -140,14 +141,14 @@ inline ConcreteFunction* SavedModelAPI::GetConcreteFunction( return ConcreteFunction::wrap(function); } -inline ConcreteFunction* SavedModelAPI::GetSignatureDefFunction( +inline SignatureDefFunction* SavedModelAPI::GetSignatureDefFunction( const std::string& function_path, Status* status) { - TF_ConcreteFunction* function = TF_GetSavedModelSignatureDefFunction( + TF_SignatureDefFunction* function = TF_GetSavedModelSignatureDefFunction( saved_model_.get(), function_path.c_str(), status->GetTFStatus()); if (!status->ok()) { return nullptr; } - return ConcreteFunction::wrap(function); + return SignatureDefFunction::wrap(function); } inline std::vector SavedModelAPI::ListFunctions() { diff --git a/tensorflow/cc/saved_model/experimental/public/signature_def_function.h b/tensorflow/cc/saved_model/experimental/public/signature_def_function.h new file mode 100644 index 00000000000..bc72d208e87 --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/signature_def_function.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_
+#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_
+
+#include
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h"
+#include "tensorflow/cc/experimental/base/public/status.h"
+#include "tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h"
+
+namespace tensorflow {
+namespace experimental {
+namespace cc {
+
+// SignatureDefFunctions are functions that correspond to either:
+// "signatures" saved from the TF2 SavedModel APIs:
+// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/save.py#L830-L854
+// or the "SignatureDefMap" saved from the TF1 SavedModel APIs:
+// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/load_v1_in_v2_test.py#L170-L174
+// In both cases, a SignatureDef is serialized as a SignatureDef protobuf:
+// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/core/protobuf/meta_graph.proto#L260-L330
+// and represents a computation defined by a TF subgraph.
+// These Signatures were primarily designed to be interoperable with the legacy
+// TF 1 Session-based C++ SavedModelBundle loading APIs:
+// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/cc/saved_model/loader.h#L96-L108
+// SignatureDefFunctions have different semantics from regular TF2
+// ConcreteFunctions, and are mainly intended to provide a serving-friendly
+// transition point from the TF1 Session API.
+// First, SignatureDefFunctions have different calling conventions.
+// SignatureDefFunctions' inputs and outputs are constrained to **flattened
+// lists of TensorHandles only**. They do not support more exotic input/output
+// types (like optionals, generators, etc). Additionally, this flattening means
+// they will not preserve the exact interface of the original tf.function they
+// were traced from, as things like composite tensors decay into their
+// internal dense tensor representation.
+// Second, all inputs and outputs are "named", and these names are load bearing
+// (e.g. they are part of the interface of tensorflow_serving):
+// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L21
+// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L39
+// The name of each input/output is stored in the corresponding tf::Argument in
+// SignatureDefFunctionMetadata::arguments(). Users must ensure the order of
+// TensorHandles passed to the function matches the order of named arguments.
+// Similarly, the names of the outputs are stored in
+// SignatureDefFunctionMetadata::returns().
+class SignatureDefFunction final {
+ public:
+  // Returns FunctionMetadata associated with this SignatureDefFunction.
+  const SignatureDefFunctionMetadata* GetFunctionMetadata();
+
+ private:
+  friend class SavedModelAPI;
+  friend class ConcreteFunctionList;
+
+  // TODO(bmzhao): Consider adding a macro for wrapping/unwrapping
+  // when moving out of experimental.
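  // Editorial sketch (not part of the original patch), showing how a caller
  // might obtain and inspect one of these functions; "serving_default" is a
  // hypothetical signature key and `model` an already-loaded SavedModelAPI:
  //
  //   Status status;
  //   SignatureDefFunction* fn =
  //       model->GetSignatureDefFunction("serving_default", &status);
  //   if (status.ok()) {
  //     const SignatureDefFunctionMetadata* metadata =
  //         fn->GetFunctionMetadata();
  //   }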
+ static SignatureDefFunction* wrap(TF_SignatureDefFunction* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunction* unwrap(SignatureDefFunction* p) { + return reinterpret_cast(p); + } +}; + +inline const SignatureDefFunctionMetadata* +SignatureDefFunction::GetFunctionMetadata() { + return SignatureDefFunctionMetadata::wrap( + TF_SignatureDefFunctionGetMetadata(unwrap(this))); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h b/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h new file mode 100644 index 00000000000..6cb01bf1a26 --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SignatureDefFunctionMetadata stores additional information on each input +// and output's names, dtypes, and shape. +class SignatureDefFunctionMetadata final { + // TODO(bmzhao): Add getters here as necessary. + private: + friend class SignatureDefFunction; + static SignatureDefFunctionMetadata* wrap( + TF_SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunctionMetadata* unwrap( + SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index f9c720a2ba2..ecefe7d0406 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -95,16 +96,6 @@ static Status ValidateSavedTensors(const GraphDef& graph_def) { return Status::OK(); } -Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, - const SessionOptions& session_options, - std::unique_ptr* session) { - Session* session_p = nullptr; - TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); - session->reset(session_p); - TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph_def.graph_def())); - return (*session)->Create(meta_graph_def.graph_def()); -} - Tensor CreateStringTensor(const string& value) { Tensor tensor(DT_STRING, TensorShape({})); tensor.scalar()() = value; @@ -228,22 +219,18 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, nullptr /* outputs */, &run_metadata, session); } -Status ReadSavedModelDebugInfoIfPresent( - const string& export_dir, - std::unique_ptr* debug_info_proto) { - LOG(INFO) << "Reading SavedModel debug info (if present) from: " - << export_dir; +} // namespace - const string debug_info_pb_path = - io::JoinPath(export_dir, "debug", "saved_model_debug_info.pb"); - if (Env::Default()->FileExists(debug_info_pb_path).ok()) { - GraphDebugInfo debug_info; - TF_RETURN_IF_ERROR( - ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info)); - *debug_info_proto = - absl::make_unique(std::move(debug_info)); - } - return Status::OK(); +SavedModelBundleInterface::~SavedModelBundleInterface() {} + +Status LoadMetagraphIntoSession(const SessionOptions& session_options, + const MetaGraphDef& meta_graph, + std::unique_ptr* session) { + Session* session_p = nullptr; + TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); + session->reset(session_p); + TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph.graph_def())); + return (*session)->Create(meta_graph.graph_def()); } Status LoadSavedModelInternal(const SessionOptions& session_options, @@ -251,46 +238,17 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, const string& export_dir, const std::unordered_set& tags, SavedModelBundle* const bundle) { - const uint64 read_start_microseconds = Env::Default()->NowMicros(); TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags, &bundle->meta_graph_def)); TF_RETURN_IF_ERROR( ReadSavedModelDebugInfoIfPresent(export_dir, &bundle->debug_info)); - TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession( - bundle->meta_graph_def, session_options, &bundle->session)); - - std::vector asset_file_defs; - TF_RETURN_IF_ERROR( - internal::GetAssetFileDefs(bundle->meta_graph_def, &asset_file_defs)); - TF_RETURN_IF_ERROR( - RunRestore(run_options, export_dir, - bundle->meta_graph_def.saver_def().restore_op_name(), - bundle->meta_graph_def.saver_def().filename_tensor_name(), - asset_file_defs, bundle->session.get())); - // Record walltime spent in restoring graph from disk, but postpone metric - // increments until graph init finishes. 
- const uint64 restore_graph_walltime = - GetLatencyMicroseconds(read_start_microseconds); - - const uint64 graph_init_start_microseconds = Env::Default()->NowMicros(); - string init_op_name; - TF_RETURN_IF_ERROR( - internal::GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name)); - TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def, - asset_file_defs, bundle->session.get(), - init_op_name)); - load_latency_by_stage->GetCell(export_dir, "restore_graph") - ->Add(restore_graph_walltime); - // Record wall time spent in init op. - load_latency_by_stage->GetCell(export_dir, "init_graph") - ->Add(GetLatencyMicroseconds(graph_init_start_microseconds)); + TF_RETURN_IF_ERROR(LoadMetagraphIntoSession( + session_options, bundle->meta_graph_def, &bundle->session)); + TF_RETURN_IF_ERROR(RestoreSession(run_options, bundle->meta_graph_def, + export_dir, &bundle->session)); return Status::OK(); } -} // namespace - -SavedModelBundleInterface::~SavedModelBundleInterface() {} - Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, @@ -424,6 +382,35 @@ class LiteSessionWrapper : public Session { }; } // namespace +Status RestoreSession(const RunOptions& run_options, + const MetaGraphDef& meta_graph, const string& export_dir, + std::unique_ptr* session) { + const uint64 read_start_microseconds = Env::Default()->NowMicros(); + std::vector asset_file_defs; + TF_RETURN_IF_ERROR(internal::GetAssetFileDefs(meta_graph, &asset_file_defs)); + TF_RETURN_IF_ERROR(RunRestore(run_options, export_dir, + meta_graph.saver_def().restore_op_name(), + meta_graph.saver_def().filename_tensor_name(), + asset_file_defs, session->get())); + // Record walltime spent in restoring graph from disk, but postpone metric + // increments until graph init finishes. + const uint64 restore_graph_walltime = + GetLatencyMicroseconds(read_start_microseconds); + + const uint64 graph_init_start_microseconds = Env::Default()->NowMicros(); + string init_op_name; + TF_RETURN_IF_ERROR( + internal::GetInitOp(export_dir, meta_graph, &init_op_name)); + TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, meta_graph, + asset_file_defs, session->get(), init_op_name)); + load_latency_by_stage->GetCell(export_dir, "restore_graph") + ->Add(restore_graph_walltime); + // Record wall time spent in init op. + load_latency_by_stage->GetCell(export_dir, "init_graph") + ->Add(GetLatencyMicroseconds(graph_init_start_microseconds)); + return Status::OK(); +} + Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h index 2b2e44bc619..5ef6070998e 100644 --- a/tensorflow/cc/saved_model/loader.h +++ b/tensorflow/cc/saved_model/loader.h @@ -96,6 +96,21 @@ class SavedModelBundleLite : public SavedModelBundleInterface { protobuf::Map signatures_; }; +// Restore variable and resources in the SavedModel export dir for the +// indicated metagraph. +// The recommended way to load a saved model is to call LoadSavedModel, +// which provides an already initialized Metagraph, Session, and DebugInfo. +Status RestoreSession(const RunOptions& run_options, + const MetaGraphDef& meta_graph, const string& export_dir, + std::unique_ptr* session); + +// Initialize a session which wraps this metagraph. 
+// The recommended way to load a saved model is to call LoadSavedModel, +// which provides an already initialized Metagraph, Session, and DebugInfo. +Status LoadMetagraphIntoSession(const SessionOptions& session_options, + const MetaGraphDef& meta_graph, + std::unique_ptr* session); + /// Loads a SavedModel from the specified export directory. The MetaGraphDef /// to be loaded is identified by the supplied tags, corresponding exactly to /// the set of tags used at SavedModel build time. Stores a SavedModel bundle in diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc index d6d99229372..c1d4736f6b9 100644 --- a/tensorflow/cc/saved_model/reader.cc +++ b/tensorflow/cc/saved_model/reader.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/memory/memory.h" #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -86,4 +87,22 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir, return Status::OK(); } +Status ReadSavedModelDebugInfoIfPresent( + const string& export_dir, + std::unique_ptr* debug_info_proto) { + LOG(INFO) << "Reading SavedModel debug info (if present) from: " + << export_dir; + + const string debug_info_pb_path = + io::JoinPath(export_dir, "debug", "saved_model_debug_info.pb"); + if (Env::Default()->FileExists(debug_info_pb_path).ok()) { + GraphDebugInfo debug_info; + TF_RETURN_IF_ERROR( + ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info)); + *debug_info_proto = + absl::make_unique(std::move(debug_info)); + } + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h index 5815108df2a..602f6cb21c1 100644 --- a/tensorflow/cc/saved_model/reader.h +++ b/tensorflow/cc/saved_model/reader.h @@ -22,6 +22,7 @@ limitations under the License. #include #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" namespace tensorflow { @@ -34,6 +35,11 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir, const std::unordered_set& tags, MetaGraphDef* const meta_graph_def); +// Store debug info from the SavedModel export dir. +Status ReadSavedModelDebugInfoIfPresent( + const string& export_dir, + std::unique_ptr* debug_info_proto); + } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_READER_H_ diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index bc630bcaede..b5e8b67a123 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -106,5 +106,11 @@ TEST_F(ReaderTest, InvalidExportPath) { EXPECT_FALSE(st.ok()); } +TEST_F(ReaderTest, ReadSavedModelDebugInfoIfPresent) { + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); + std::unique_ptr debug_info_proto; + TF_ASSERT_OK(ReadSavedModelDebugInfoIfPresent(export_dir, &debug_info_proto)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/saved_model_bundle_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc index d6c375c7448..3f258745fa4 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/cc/saved_model/loader.h" - #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/cc/saved_model/signature_constants.h" #include "tensorflow/cc/saved_model/tag_constants.h" #include "tensorflow/core/example/example.pb.h" @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" namespace tensorflow { namespace { @@ -131,6 +132,43 @@ TEST_F(LoaderTest, TagMatch) { CheckSavedModelBundle(export_dir, bundle); } +TEST_F(LoaderTest, ReadMetaGraphFromSavedModel) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + MetaGraphDef actual_metagraph; + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &actual_metagraph)); + EXPECT_EQ(actual_metagraph.DebugString(), + bundle.meta_graph_def.DebugString()); +} + +TEST_F(LoaderTest, RestoreSession) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + + SavedModelBundle actual_bundle; + const std::unordered_set tags = {kSavedModelTagServe}; + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, tags, + &actual_bundle.meta_graph_def)); + TF_ASSERT_OK(LoadMetagraphIntoSession( + session_options, actual_bundle.meta_graph_def, &actual_bundle.session)); + TF_ASSERT_OK(RestoreSession(run_options, actual_bundle.meta_graph_def, + export_dir, &actual_bundle.session)); + CheckSavedModelBundle(export_dir, actual_bundle); +} + TEST_F(LoaderTest, NoTagMatch) { SavedModelBundle bundle; RunOptions run_options; @@ -270,6 +308,9 @@ TEST_F(LoaderTest, NegativeShapeDimension) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); + EXPECT_NE( + st.error_message().find("initializes from a tensor with -1 elements"), + std::string::npos); } TEST_F(LoaderTest, ConstNoValue) { @@ -282,6 +323,9 @@ TEST_F(LoaderTest, ConstNoValue) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); + EXPECT_NE( + st.error_message().find("constant tensor but no value has been provided"), + std::string::npos); } } // namespace diff --git a/third_party/sycl/crosstool/BUILD b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/assets/empty similarity index 100% rename from third_party/sycl/crosstool/BUILD rename to tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/assets/empty diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/saved_model.pb similarity index 100% rename from tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value rename to 
tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/saved_model.pb diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3fd3ba2223d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index new file mode 100644 index 00000000000..7357e8d57ed Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/assets/empty b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/assets/empty new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/saved_model.pb similarity index 100% rename from tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape rename to tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/saved_model.pb diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3fd3ba2223d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index new file mode 100644 index 00000000000..7357e8d57ed Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index differ diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index ecbb1a5d200..82375577610 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -15,6 +15,7 @@ package_group( "//tensorflow/compiler/tf2xla:internal", ], packages = [ + "//tensorflow/c/...", "//tensorflow/compiler/tests/...", "//tensorflow/python/...", ], @@ -128,22 +129,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "xla_interpreter_device", - srcs = ["xla_interpreter_device.cc"], - visibility = [":friends"], - deps = [ - ":jit_compilation_passes", - ":xla_device", - "//tensorflow/compiler/jit/kernels:xla_ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:interpreter_plugin", # buildcleaner: keep - "@com_google_absl//absl/memory", - ], - alwayslink = 1, -) - cc_library( name = "xla_tensor", srcs = ["xla_tensor.cc"], @@ -211,6 +196,7 @@ XLA_DEVICE_DEPS = [ "//tensorflow/core/kernels/data:optional_ops", "//tensorflow/core/kernels/data:prefetch_dataset_op", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:tf_allocator_adapter", "//tensorflow/stream_executor/platform", ] @@ -221,16 +207,19 @@ cc_library( "xla_device.cc", 
"xla_device_context.cc", "xla_device_ops.cc", + "xla_ops_on_regular_devices.cc", + "xla_platform_info.cc", ], hdrs = [ "xla_compile_on_demand_op.h", "xla_device.h", "xla_device_context.h", "xla_device_ops.h", + "xla_platform_info.h", ], # Public visibility is needed for external TF/XLA backends. visibility = ["//visibility:public"], - deps = XLA_DEVICE_DEPS, + deps = XLA_DEVICE_DEPS + [":xla_compilation_cache"], ) cc_library( @@ -341,8 +330,10 @@ cc_library( srcs = ["xla_compilation_cache.cc"], hdrs = ["xla_compilation_cache.h"], deps = [ + ":flags", ":xla_activity_listener", ":xla_activity_proto_cc", + "//tensorflow/compiler/mlir:array_container_utils", "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -373,8 +364,11 @@ tf_cc_test( "xla_compilation_cache_test.cc", ], deps = [ + ":flags", ":xla_compilation_cache", + ":xla_cpu_jit", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/xla/client:client_library", "//tensorflow/core:test", "//tensorflow/core:test_main", ], @@ -394,20 +388,70 @@ cc_library( alwayslink = 1, ) -# Linked by tensorflow core, without registration of jit compilation passes -# which is not necessary to create and run a XlaLocalLaunchBase kernel. -# Linking jit compilation passes could cause programs stuck right now (b/140069592). cc_library( - name = "xla_kernel_creator_util", - srcs = [ - "xla_kernel_creator_util.cc", + name = "get_compiler_ir", + srcs = ["get_compiler_ir.cc"], + hdrs = ["get_compiler_ir.h"], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", + ], + deps = [ + ":common", + ":compilability_check_util", + ":flags", + ":xla_device_no_jit_rewrite_registration", + ":xla_launch_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime:core_cpu_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + ], + alwayslink = 1, +) + +# Header-only version of "flags" library, for linking from the shared object +# without ODR violations. 
+cc_library( + name = "get_compiler_ir_hdrs_only", + hdrs = ["get_compiler_ir.h"], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", + ], + deps = [ + "//tensorflow/compiler/xla:statusor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "xla_kernel_creator", + srcs = [ + "xla_kernel_creator.cc", + "xla_kernel_creator.h", + ], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", ], - hdrs = ["xla_kernel_creator_util.h"], - visibility = ["//tensorflow/core/common_runtime/eager:__pkg__"], deps = [ ":common", ":compilability_check_util", ":compilation_passes", + ":flags", + ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops_no_jit_rewrite_registration", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", @@ -422,25 +466,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "xla_kernel_creator", - srcs = [ - "xla_kernel_creator.cc", - "xla_kernel_creator.h", - ], - deps = [ - ":compilability_check_util", - ":flags", - ":jit_compilation_passes", - ":xla_kernel_creator_util", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], - alwayslink = 1, -) - tf_cc_test( name = "xla_kernel_creator_test", srcs = [ @@ -632,7 +657,6 @@ cc_library( ":flags", ":resource_operation_safety_analysis", ":shape_inference_helpers", - ":union_find", ":xla_activity_listener", ":xla_cluster_util", "//tensorflow/cc:cc_ops", @@ -651,8 +675,8 @@ cc_library( "//tensorflow/compiler/tf2xla/cc:xla_ops", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", - "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -729,11 +753,6 @@ tf_cc_test( ], ) -cc_library( - name = "union_find", - hdrs = ["union_find.h"], -) - tf_cc_test( name = "deadness_analysis_test", size = "small", @@ -828,6 +847,7 @@ tf_cc_test( "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:test", + "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -914,7 +934,6 @@ cc_library( ":device_util", ":flags", ":resource_operation_safety_analysis", - ":union_find", ":xla_activity_listener", ":xla_activity_proto_cc", ":xla_cluster_util", @@ -923,6 +942,7 @@ cc_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -946,6 +966,7 @@ tf_cc_test( ":xla_cpu_jit", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/compiler/tf2xla:test_util", @@ -972,6 +993,7 @@ tf_cc_test( ":xla_cpu_jit", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", + "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", 
"//tensorflow/core:direct_session_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc index 8463c788496..160ea83585d 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc @@ -130,17 +130,6 @@ FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { return fdef_lib; } -FunctionDefLibrary CreateFunctionDefLibWithInt32Input(const string& name) { - FunctionDefLibrary fdef_lib; - FunctionDef func = FunctionDefHelper::Create( - /*function_name=*/name, /*in_def=*/{"in: int32"}, - /*out_def=*/{"out: int32"}, - /*attr_def=*/{}, /*node_def=*/{{{"out"}, "Identity", {"in"}}}, - /*ret_def=*/{{"out", "out:output:0"}}); - *fdef_lib.add_function() = std::move(func); - return fdef_lib; -} - TEST_F(BuildXlaOpsTest, ControlDepsPreserved) { const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError(); @@ -269,6 +258,17 @@ TEST_F(BuildXlaOpsTest, NoExtraMergeForEdgeToSink) { } #ifdef GOOGLE_CUDA +FunctionDefLibrary CreateFunctionDefLibWithInt32Input(const string& name) { + FunctionDefLibrary fdef_lib; + FunctionDef func = FunctionDefHelper::Create( + /*function_name=*/name, /*in_def=*/{"in: int32"}, + /*out_def=*/{"out: int32"}, + /*attr_def=*/{}, /*node_def=*/{{{"out"}, "Identity", {"in"}}}, + /*ret_def=*/{{"out", "out:output:0"}}); + *fdef_lib.add_function() = std::move(func); + return fdef_lib; +} + // This tests a rewrite that only makes sense and is active in a CUDA-enabled // build. Specifically we check that we insert an IdentityN op to avoid extra // device-to-host copies. diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 6d4bc51f1b2..20efbe248d7 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -36,7 +36,6 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" @@ -44,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -84,6 +84,43 @@ Status MakeCallNodeFromAttribute(const Node& node, const std::string& attr_name, return Status::OK(); } +// Utility which searches for values in a sorted list by scanning over it once. +// No matter how many times ScanForValue is called, the list is scanned at most +// once. However, if a call to ScanForValue skips over a value, that value is +// not revisited in future calls to ScanForValue, so callers must take +// care to order their calls. +// +// Useful for merging multiple sorted lists in O(n) time. +class SinglePassSearch { + public: + // Creates a SinglePassSearch object that can be used to search in `values`. 
+ // Does not take ownership of `values`. `values` must outlive this. + // `values` must be sorted. + explicit SinglePassSearch(absl::Span<int const> values) + : current_index_(0), values_(values) {} + + // Scans forward in the vector looking for "value", updating the internal + // position in to the vector. + // Returns true iff the vector contains the given value at or after current + // position. + // Not thread-safe. + bool ScanForValue(int value) { + while (current_index_ < values_.size() && + values_[current_index_] <= value) { + if (values_[current_index_] == value) { + current_index_++; + return true; + } + current_index_++; + } + return false; + } + + private: + int current_index_; + const absl::Span<int const> values_; +}; + } // anonymous namespace RecursiveCompilabilityChecker::UncompilableNodesMap @@ -518,23 +555,23 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( } } +// Returns `true` iff node has a given `attr` set to `true`. Returns `false` +// both for the missing attr, and the attr set to `false`. +static bool HasBoolAttr(const NodeDef& node, const char* attr) { + const auto& it = node.attr().find(attr); + return it != node.attr().end() && it->second.b(); +} + bool CanCreateXlaKernel(const NodeDef& node_def) { - // If kXlaMustCompileAttr is set on the node_def, use its value. - const auto& it = node_def.attr().find(kXlaMustCompileAttr); - return it != node_def.attr().end() && it->second.b(); + return HasBoolAttr(node_def, kXlaMustCompileAttr); } Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, - const NodeDef& node_def, + const NameAttrList& function, const FunctionBody** fbody, std::vector<int>* constant_arg_indices, std::vector<int>* resource_arg_indices) { FunctionLibraryRuntime::Handle handle; - // If node_def is not instantiable, e.g., the function does not exist, - // simply bail out. - NameAttrList function; - TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); - TF_RETURN_IF_ERROR( flr->Instantiate(function.name(), AttrSlice(&function.attr()), &handle)); *fbody = flr->GetFunctionBody(handle); @@ -564,4 +601,96 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, return Status::OK(); } +tensorflow::MemoryTypeVector GetInputMemoryTypes( + const tensorflow::FunctionBody* fbody, + absl::Span<int const> constant_arg_indices, + absl::Span<int const> resource_arg_indices) { + // Set input and output memory types. + tensorflow::MemoryTypeVector input_memory_types(fbody->arg_types.size(), + tensorflow::DEVICE_MEMORY); + // These indices are used only for optimization purposes. They allow us + // to loop over constant_arg_indices and resource_arg_indices only once + // while iterating over all the function arguments checking if it is a + // resource or a constant. + // The reason we optimized this code is because functions can have a lot of + // captured arguments. For example, the backward pass of ResNet50 takes in all + // 214 variables and a similar number of activations. + SinglePassSearch constants_search(constant_arg_indices); + SinglePassSearch resources_search(resource_arg_indices); + for (size_t i = 0; i < fbody->arg_types.size(); ++i) { + if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { + // Compile-time constants and resource handles are expected to be in + // host memory.
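A minimal sketch of how SinglePassSearch is intended to be driven (the index values below are made up for illustration): queries must arrive in non-decreasing order, so the sorted list is walked at most once no matter how many times ScanForValue is called.

  // Sorted indices of, say, compile-time constant arguments (illustrative).
  std::vector<int> constant_arg_indices = {1, 4};
  SinglePassSearch constants_search(constant_arg_indices);
  for (int i = 0; i < 6; ++i) {
    // True exactly for i == 1 and i == 4; the internal cursor only advances.
    bool is_constant_arg = constants_search.ScanForValue(i);
    (void)is_constant_arg;
  }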
+ input_memory_types[i] = tensorflow::HOST_MEMORY; + } + } + return input_memory_types; +} + +tensorflow::MemoryTypeVector GetOutputMemoryTypes( + const tensorflow::FunctionBody* fbody) { + tensorflow::MemoryTypeVector output_memory_types(fbody->ret_types.size(), + tensorflow::DEVICE_MEMORY); + for (size_t i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == tensorflow::DT_RESOURCE) { + output_memory_types[i] = tensorflow::HOST_MEMORY; + } + } + return output_memory_types; +} + +static auto const ops_triggering_xla_compilation = + new absl::flat_hash_set{"XlaBroadcastHelper", + "XlaConv", + "XlaDequantize", + "XlaDot", + "XlaDynamicSlice", + "XlaDynamicUpdateSlice", + "XlaEinsum", + "XlaGather", + "XlaIf", + "XlaKeyValueSort", + "XlaPad", + "XlaRecv", + "XlaReduce", + "XlaReduceWindow", + "XlaReplicaId", + "XlaScatter", + "XlaSelectAndScatter", + "XlaSelfAdjointEig", + "XlaSend", + "XlaSharding", + "XlaSort", + "XlaSpmdFullToShardShape", + "XlaSpmdShardToFullShape", + "XlaSvd", + "XlaWhile"}; + +static bool NodeCanTriggerXlaCompilation(const NodeDef& node) { + return node.attr().find(kXlaClusterIdAttr) != node.attr().end() || + HasBoolAttr(node, kXlaMustCompileAttr) || + HasBoolAttr(node, kXlaCompileAttr) || + HasBoolAttr(node, kXlaScopeAttr) || + HasBoolAttr(node, kXlaInternalScopeAttr) || + ops_triggering_xla_compilation->count(node.op()); +} + +bool CanTriggerXlaCompilation(const GraphDef& graph) { + for (const FunctionDef& function : graph.library().function()) { + for (const NodeDef& node : function.node_def()) { + if (NodeCanTriggerXlaCompilation(node)) { + return true; + } + } + } + + for (const NodeDef& node : graph.node()) { + if (NodeCanTriggerXlaCompilation(node)) { + return true; + } + } + + return false; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 3b20784cc29..3c1378bf764 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -26,11 +26,11 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -126,9 +126,10 @@ class RecursiveCompilabilityChecker { bool allow_inaccurate_ops = false; }; - RecursiveCompilabilityChecker(const OperationFilter* op_filter, - const DeviceType* jit_device_type) - : op_filter_(*op_filter), jit_device_type_(*jit_device_type) {} + RecursiveCompilabilityChecker(OperationFilter op_filter, + DeviceType jit_device_type) + : op_filter_(std::move(op_filter)), + jit_device_type_(std::move(jit_device_type)) {} using UncompilableNodesMap = std::map* constant_arg_indices, std::vector* resource_arg_indices); @@ -282,6 +282,44 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, // set. bool CanCreateXlaKernel(const NodeDef& node_def); +// Returns memory types for the input. 
+// `constant_arg_indices` and `resource_arg_indices` are sorted arrays of +// indices corresponding to constant and resource arguments respectively. +// +// One might wonder, about the case where a compile-time constant argument +// (which must be in host memory) is also used as an input into an op, +// e.g. `Add`, that expects its inputs in device memory. Here is how it +// works now. +// First, what do we mean by "op expects an input in XYZ memory"? +// There are two types of "ops" here: the tf2xla kernel and the HLO +// computation it builds. The tf2xla kernel needs to retrieve the actual +// numeric value of the compile-time constant tensors, so it really expects +// them to be on in host memory. However, for other inputs, it refers to them +// using xla::ComputationDataHandle, which is just a symbolic handle that +// xla::ComputationBuilder assigns. How does this handle gets assigned for +// constant arguments? Even constant arguments get an _Arg node in the graph +// instantiated for Function compilation. The tf2xla kernel for constant _Arg +// nodes takes the constant value, converts it to XlaLiteral, and feeds it +// to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This +// constant XlaLiteral is included in the HLO graph, and subsequently, in +// the actual executable, which is copied to the device before being +// executed. Thus, when this executable runs, the constant is available in +// device memory. +tensorflow::MemoryTypeVector GetInputMemoryTypes( + const tensorflow::FunctionBody* fbody, + absl::Span constant_arg_indices, + absl::Span resource_arg_indices); + +// Returns output memory types. +// +// XlaLaunch kernel keeps all outputs (including constants, which it copies), +// in device memory except for resources. +tensorflow::MemoryTypeVector GetOutputMemoryTypes( + const tensorflow::FunctionBody* fbody); + +// Check whether graph can trigger XLA compilation. +bool CanTriggerXlaCompilation(const GraphDef& graph); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_ diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index 3ea38e69ad9..3851c66ba1a 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -75,8 +76,8 @@ class CompilabilityCheckUtilTest : public ::testing::Test { op_filter_.allow_inaccurate_ops = false; op_filter_.allow_slow_ops = false; - checker_ = absl::make_unique(&op_filter_, - &device_type_); + checker_ = absl::make_unique(op_filter_, + device_type_); } FunctionLibraryRuntime* GetFunctionLibraryRuntime() { @@ -354,5 +355,110 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { "unsupported op")); } +TEST_F(CompilabilityCheckUtilTest, TestCanNotTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + FunctionDef identity_func = FunctionDefHelper::Create( + "IdentityFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "Identity", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + *library.add_function() = identity_func; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("IdentityFunc"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_FALSE(CanTriggerXlaCompilation(graph_def)); +} + +TEST_F(CompilabilityCheckUtilTest, TestXlaOpsCanTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + FunctionDef sort_func = FunctionDefHelper::Create( + "SortFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "XlaSort", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + *library.add_function() = sort_func; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("SortFunc"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_TRUE(CanTriggerXlaCompilation(graph_def)); +} + +TEST_F(CompilabilityCheckUtilTest, TestCanTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + AttrValue true_attribute; + true_attribute.set_b(true); + + FunctionDef identity_func = FunctionDefHelper::Create( + "IdentityFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "Identity", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + (*identity_func.mutable_attr())[kXlaMustCompileAttr] = true_attribute; + + FunctionDef call_identity = FunctionDefHelper::Create( + "CallIdentity", + /*in_def=*/{"x:float"}, + /*out_def=*/{"z:float"}, /*attr_def=*/{}, + /*node_def=*/ + {{{"func_call"}, + "PartitionedCall", + {"x"}, + {{"Tin", DataTypeSlice({DT_FLOAT})}, + {"Tout", DataTypeSlice({DT_FLOAT})}, + {"f", + FunctionDefHelper::FunctionRef("IdentityRef", {{"T", DT_FLOAT}})}, + {kXlaMustCompileAttr, true}}}}, + 
/*ret_def=*/{{"z", "func_call:output:0"}}); + + *library.add_function() = identity_func; + *library.add_function() = call_identity; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("CallIdentity"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_TRUE(CanTriggerXlaCompilation(graph_def)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.cc b/tensorflow/compiler/jit/defs.cc index 4bea71e8fc1..84e1e36bcf6 100644 --- a/tensorflow/compiler/jit/defs.cc +++ b/tensorflow/compiler/jit/defs.cc @@ -28,4 +28,6 @@ const char* const kXlaScopeAttr = "_XlaScope"; // only when auto_jit is ON. const char* const kXlaInternalScopeAttr = "_XlaInternalScope"; +const char* const kXlaClusterIdAttr = "_xla_compile_id"; + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.h b/tensorflow/compiler/jit/defs.h index 9eb4c2ca2e8..fa983db8df8 100644 --- a/tensorflow/compiler/jit/defs.h +++ b/tensorflow/compiler/jit/defs.h @@ -35,6 +35,9 @@ extern const char* const kXlaCompileAttr; // "_XlaCompile" extern const char* const kXlaScopeAttr; // "_XlaScope" extern const char* const kXlaInternalScopeAttr; // "_XlaInternalScope" +// The id of the compiled cluster. +extern const char* const kXlaClusterIdAttr; // "_xla_compile_id" + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_DEFS_H_ diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index ed25baa62ff..4a5c79c02d9 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -34,9 +35,6 @@ limitations under the License. namespace tensorflow { -const char* const EncapsulateXlaComputationsPass::kXlaClusterAttr = - "_xla_compile_id"; - namespace { const char* const kXlaClusterOutput = "XlaClusterOutput"; @@ -45,10 +43,7 @@ bool IsCpuGpuCompile(const Graph* graph) { for (Node* n : graph->nodes()) { string name; // Only consider nodes being compiled. - if (!GetNodeAttr(n->attrs(), - EncapsulateXlaComputationsPass::kXlaClusterAttr, &name) - .ok()) - continue; + if (!GetNodeAttr(n->attrs(), kXlaClusterIdAttr, &name).ok()) continue; // Early return for any node with a device that is not a CPU or GPU. DeviceNameUtils::ParsedName parsed; if (DeviceNameUtils::ParseFullName(n->requested_device(), &parsed)) { @@ -180,8 +175,7 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, retvals[i]->AddAttr("index", i); } - AddNodeAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, call_def->name(), - call_def); + AddNodeAttr(kXlaClusterIdAttr, call_def->name(), call_def); AddNodeAttr("_variable_start_index", variable_start_index, call_def); // Uniquify the function name. @@ -216,8 +210,8 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, // O(n) pass over the edges. 
for (const Edge* e : (*graph)->edges()) { if (!e->IsControlEdge() && - e->src()->attrs().Find(kXlaClusterAttr) != nullptr && - e->dst()->attrs().Find(kXlaClusterAttr) == nullptr && + e->src()->attrs().Find(kXlaClusterIdAttr) != nullptr && + e->dst()->attrs().Find(kXlaClusterIdAttr) == nullptr && e->dst()->type_string() != kXlaClusterOutput) { return errors::InvalidArgument( "Undeclared output of XLA computation. Some common causes of this " @@ -232,9 +226,9 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, auto output = absl::make_unique((*graph)->op_registry()); TF_RETURN_WITH_CONTEXT_IF_ERROR( - EncapsulateSubgraphsInFunctions(kXlaClusterAttr, **graph, RewriteSubgraph, - /*reuse_existing_functions=*/true, - &output, flib_def), + EncapsulateSubgraphsInFunctions( + kXlaClusterIdAttr, **graph, RewriteSubgraph, + /*reuse_existing_functions=*/true, &output, flib_def), "EncapsulateXlaComputationsPass failed"); graph->swap(output); return Status::OK(); @@ -246,7 +240,7 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, // while iterating. std::vector launch_nodes; for (Node* n : graph->nodes()) { - const string& name = GetNodeAttrString(n->attrs(), kXlaClusterAttr); + const string& name = GetNodeAttrString(n->attrs(), kXlaClusterIdAttr); if (!name.empty()) { launch_nodes.push_back(n); } diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h index 3057e4c7469..9931b23fa41 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h @@ -34,8 +34,6 @@ namespace tensorflow { // XlaLaunch operators. class EncapsulateXlaComputationsPass : public GraphOptimizationPass { public: - static const char* const kXlaClusterAttr; // _xla_compile_id - Status Run(const GraphOptimizationPassOptions& options) override; // The following methods are public only for unit tests. diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index cc177036591..61c9a3ff9c0 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h" #include "tensorflow/compiler/tf2xla/test_util.h" @@ -46,19 +47,18 @@ static std::unique_ptr MakeOuterGraph( auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE); NodeDef def; - TF_CHECK_OK( - NodeDefBuilder("launch0", function, &flib_def) - .Input(a.node()->name(), 0, DT_INT32) - .Input(b.node()->name(), 0, DT_FLOAT) - .Input(c.node()->name(), 0, DT_INT32) - .Input(d.node()->name(), 0, DT_FLOAT) - .Input(u.node()->name(), 0, DT_RESOURCE) - .Input(v.node()->name(), 0, DT_RESOURCE) - .Input(w.node()->name(), 0, DT_RESOURCE) - .Device("/gpu:0") - .Attr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0") - .Attr("_variable_start_index", 4) - .Finalize(&def)); + TF_CHECK_OK(NodeDefBuilder("launch0", function, &flib_def) + .Input(a.node()->name(), 0, DT_INT32) + .Input(b.node()->name(), 0, DT_FLOAT) + .Input(c.node()->name(), 0, DT_INT32) + .Input(d.node()->name(), 0, DT_FLOAT) + .Input(u.node()->name(), 0, DT_RESOURCE) + .Input(v.node()->name(), 0, DT_RESOURCE) + .Input(w.node()->name(), 0, DT_RESOURCE) + .Device("/gpu:0") + .Attr(kXlaClusterIdAttr, "launch0") + .Attr("_variable_start_index", 4) + .Finalize(&def)); Status status; Node* launch = scope.graph()->AddNode(def, &status); @@ -107,7 +107,7 @@ static std::unique_ptr MakeBodyGraph() { auto arg6 = ops::_Arg(scope.WithOpName("w_0_arg"), DT_RESOURCE, 6); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); node->set_requested_device("/gpu:0"); }; @@ -155,8 +155,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { : ops::Add(scope.WithOpName("E"), a1, a0); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, - "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); }; add_attrs(e.node()); @@ -216,7 +215,7 @@ TEST(EncapsulateXlaComputations, Encapsulate) { auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); node->set_requested_device("/gpu:0"); }; diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index d1301a8c40f..ee7daf092da 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -159,7 +159,7 @@ void AllocateAndParseFlags() { device_flags = new XlaDeviceFlags; device_flags->tf_xla_compile_on_demand = false; - device_flags->tf_xla_enable_xla_devices = true; + device_flags->tf_xla_enable_xla_devices = false; ops_flags = new XlaOpsCommonFlags; ops_flags->tf_xla_always_defer_compilation = false; @@ -268,10 +268,10 @@ void AppendMarkForCompilationPassFlags(std::vector* flag_list) { AppendMarkForCompilationPassFlagsInternal(flag_list); } -static bool xla_is_enabled = false; +static std::atomic xla_compilation_disabled(false); -void SetXlaIsEnabled() { xla_is_enabled = true; } +void DisableXlaCompilation() { xla_compilation_disabled = true; } -bool IsXlaEnabled() { return xla_is_enabled; } +bool FailOnXlaCompilation() { return xla_compilation_disabled; } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/flags.h 
b/tensorflow/compiler/jit/flags.h index 89e20d9f8ea..5612b3b5864 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -162,13 +162,12 @@ MlirCommonFlags* GetMlirCommonFlags(); void AppendMarkForCompilationPassFlags( std::vector* flag_list); -// Makes all future calls to `IsXlaEnabled()` return `true`. -// -// Should only be called when XLA is linked in. -void SetXlaIsEnabled(); +// Disables XLA compilation, forces it to return an error message instead. Can +// be used by a server to ensure that JIT compilation is opt-in. +void DisableXlaCompilation(); -// Returns whether XLA is enabled. -bool IsXlaEnabled(); +// Returns `false` unless `DisableXlaCompilation` was called. +bool FailOnXlaCompilation(); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc b/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc index 3ba32f07506..3692d1f3aba 100644 --- a/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc +++ b/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc @@ -38,10 +38,12 @@ Status ForceXlaConstantsOnHostPass::Run( std::vector constant_arg_indices; std::vector resource_arg_indices; + NameAttrList function; + TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node->def(), &function)); + // Force all constants to be on the host memory. TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node->def(), &fbody, &constant_arg_indices, - &resource_arg_indices)); + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); VLOG(3) << "Found constant arg indices: " << absl::StrJoin(constant_arg_indices, ", "); diff --git a/tensorflow/compiler/jit/get_compiler_ir.cc b/tensorflow/compiler/jit/get_compiler_ir.cc new file mode 100644 index 00000000000..7c6a7583c3a --- /dev/null +++ b/tensorflow/compiler/jit/get_compiler_ir.cc @@ -0,0 +1,114 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/get_compiler_ir.h" + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { + +xla::StatusOr<std::string> GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, Device* dev, + absl::Span<const Tensor* const> inputs) { + NameAttrList function; + function.set_name(std::string{func_name}); + + FunctionLibraryRuntime* flr = pflr->GetFLR(dev->name()); + ResourceMgr* rmgr = dev->resource_manager(); + + const FunctionBody* fbody = nullptr; + std::vector<int> constant_arg_indices; + std::vector<int> resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); + + MemoryTypeVector input_memory_types = + GetInputMemoryTypes(fbody, constant_arg_indices, resource_arg_indices); + MemoryTypeVector output_memory_types = GetOutputMemoryTypes(fbody); + + std::vector<VariableInfo> variable_infos; + TF_RETURN_IF_ERROR(GetVariableInfosFromInputs( + rmgr, dev, inputs, resource_arg_indices, &variable_infos)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); + + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(dev); + + XlaCompilationCache* cache; + TF_RETURN_IF_ERROR(rmgr->LookupOrCreate<XlaCompilationCache>( + rmgr->default_container(), "xla_cache", &cache, + [&](XlaCompilationCache** cache_write_into) { + return BuildXlaCompilationCache(dev, platform_info, cache_write_into); + })); + core::ScopedUnref cache_ref(cache); + + absl::optional<se::TfAllocatorAdapter> tf_allocator_adapter; + + XlaCompiler::Options options = + GenerateCompilerOptions(*cache, *flr, dev, + /*stream=*/nullptr, platform_info, + /*has_ref_vars=*/false, &tf_allocator_adapter); + + XlaCompiler::CompileOptions compile_options; + compile_options.always_return_tuple = false; + compile_options.alias_resource_update = true; + + XlaCompiler compiler(options); + + xla::StatusOr<std::vector<XlaCompiler::Argument>> args = + XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_arg_indices, inputs, variable_infos); + TF_RETURN_IF_ERROR(args.status()); + + switch (stage) { + case IrExportStage::HLO: { + XlaCompiler::CompilationResult result; + TF_RETURN_IF_ERROR( + compiler.CompileFunction(compile_options, function, *args, &result)); + + TF_ASSIGN_OR_RETURN(xla::ProgramShape program_shape, + result.computation->GetProgramShape()); + xla::HloModuleConfig config(program_shape); + TF_ASSIGN_OR_RETURN( + std::unique_ptr<xla::HloModule> new_module, + xla::HloModule::CreateFromProto(result.computation->proto(), config)); + + return new_module->ToString(); + } + case IrExportStage::OPTIMIZED_HLO: { + const XlaCompiler::CompilationResult* compilation_result = nullptr; + xla::LocalExecutable* executable = nullptr; + TF_RETURN_IF_ERROR( + cache->Compile(options, function, *args, compile_options, + XlaCompilationCache::CompileMode::kStrict, + &compilation_result, &executable)); + return executable->executable()->module().ToString(); + } + } +}
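A minimal usage sketch for the helper above, assuming a ProcessFunctionLibraryRuntime* `pflr`, a Device* `device`, and a std::vector<const Tensor*> `flat_inputs` already prepared by the caller; those names are illustrative, not part of this patch.

  // Unoptimized HLO for the named function; IrExportStage::OPTIMIZED_HLO
  // would instead go through XlaCompilationCache and return the module text
  // after XLA's optimization pipeline.
  xla::StatusOr<std::string> hlo_text = GetCompilerIr(
      IrExportStage::HLO, pflr, "my_function", device, flat_inputs);
  if (hlo_text.ok()) {
    VLOG(1) << *hlo_text;
  }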
+ +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/get_compiler_ir.h b/tensorflow/compiler/jit/get_compiler_ir.h new file mode 100644 index 00000000000..81e5af29279 --- /dev/null +++ b/tensorflow/compiler/jit/get_compiler_ir.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ +#define TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ + +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace tensorflow { + +class ProcessFunctionLibraryRuntime; +class Device; +class Tensor; + +enum class IrExportStage { HLO, OPTIMIZED_HLO }; + +// Returns HLO text for a given function `func_name` using library runtime +// `runtime` on a device `dev` with given `inputs`. +xla::StatusOr GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, Device* dev, + absl::Span inputs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 38e33a60657..12b40b1c83b 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -63,38 +64,6 @@ namespace tensorflow { namespace { -XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { - DeviceType device_type = ctx->device_type(); - se::Platform::Id platform_id = nullptr; - const XlaDevice::Metadata* xla_device_metadata = nullptr; - se::DeviceMemoryAllocator* custom_allocator = nullptr; - - if (ctx->device_type() == DeviceType(DEVICE_CPU)) { - platform_id = se::host::kHostPlatformId; - } else if (ctx->device_type() == DeviceType(DEVICE_GPU)) { - platform_id = ctx->device() - ->tensorflow_gpu_device_info() - ->stream->parent() - ->platform() - ->id(); - } else if (XlaDevice::GetMetadata(ctx, &xla_device_metadata).ok()) { - // If we are on an XlaDevice, use the underlying XLA platform's allocator - // directly. We could use the StreamExecutor's allocator which may - // theoretically be more correct, but XLA returns a nice OOM message in a - // Status and StreamExecutor does not. - // - // Importantly we can't use ctx->device()->GetAllocator() as the allocator - // (which xla_allocator above uses) as on an XlaDevice, this is a dummy - // allocator that returns XlaTensor objects. The XlaCompiler needs a real - // allocator to allocate real buffers. 
- platform_id = xla_device_metadata->platform()->id(); - custom_allocator = - xla_device_metadata->client()->backend().memory_allocator(); - } - - return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - custom_allocator); -} // A closure describing how to run a compiled version of a TensorFlow function. // @@ -178,31 +147,6 @@ class XlaExecutableClosureStore { TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore); }; -// Return allocator from platform info if non-null, or populate and return a -// pointer to the allocator adapter with allocator from context. -// -// This is necessary because for XLA devices the underlying TF allocator returns -// dummy tensors. -se::DeviceMemoryAllocator* GetAllocator( - absl::optional* tf_allocator_adapter, - OpKernelContext* ctx, const XlaPlatformInfo& platform_info) { - if (platform_info.custom_allocator()) { - return platform_info.custom_allocator(); - } - if (!ctx->op_device_context()) { - // Stream is not set for the host platform. - se::Platform* platform = - se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) - .ValueOrDie(); - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), platform); - return &tf_allocator_adapter->value(); - } - // platform_info. - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), - ctx->op_device_context()->stream()); - return &tf_allocator_adapter->value(); -} - } // namespace XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, @@ -214,68 +158,13 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, constants_(constants), resources_(resources), function_(function), - platform_info_(PlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())), has_ref_vars_(has_ref_vars) {} -static Status BuildCompilationCache(OpKernelContext* ctx, - const XlaPlatformInfo& platform_info, - XlaCompilationCache** cache) { - if (platform_info.xla_device_metadata()) { - *cache = new XlaCompilationCache( - platform_info.xla_device_metadata()->client(), - platform_info.xla_device_metadata()->jit_device_type()); - return Status::OK(); - } - - auto platform = - se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()); - if (!platform.ok()) { - return platform.status(); - } - - xla::StatusOr compiler_for_platform = - xla::Compiler::GetForPlatform(platform.ValueOrDie()); - if (!compiler_for_platform.ok()) { - // In some rare cases (usually in unit tests with very small clusters) we - // may end up transforming an XLA cluster with at least one GPU operation - // (which would normally force the cluster to be compiled using XLA:GPU) - // into an XLA cluster with no GPU operations (i.e. containing only CPU - // operations). Such a cluster can fail compilation (in way that - // MarkForCompilation could not have detected) if the CPU JIT is not linked - // in. - // - // So bail out of _XlaCompile in this case, and let the executor handle the - // situation for us. 
- const Status& status = compiler_for_platform.status(); - if (status.code() == error::NOT_FOUND) { - return errors::Unimplemented("Could not find compiler for platform ", - platform.ValueOrDie()->Name(), ": ", - status.ToString()); - } - } - - xla::LocalClientOptions client_options; - client_options.set_platform(platform.ValueOrDie()); - client_options.set_intra_op_parallelism_threads( - ctx->device()->tensorflow_cpu_worker_threads()->num_threads); - auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); - if (!client.ok()) { - return client.status(); - } - const XlaOpRegistry::DeviceRegistration* registration; - if (!XlaOpRegistry::GetCompilationDevice(platform_info.device_type().type(), - ®istration)) { - return errors::InvalidArgument("No JIT device registered for ", - platform_info.device_type().type()); - } - *cache = new XlaCompilationCache( - client.ValueOrDie(), DeviceType(registration->compilation_device_name)); - return Status::OK(); -} - static Status CompileToLocalExecutable( OpKernelContext* ctx, const NameAttrList& function, bool has_ref_vars, const XlaPlatformInfo& platform_info, + absl::Span inputs, absl::Span variable_infos, absl::Span constants, bool lazy, bool may_alias_resource_update, xla::LocalClient** client, @@ -292,7 +181,7 @@ static Status CompileToLocalExecutable( TF_RETURN_IF_ERROR(rm->LookupOrCreate( rm->default_container(), "xla_cache", &cache, [&](XlaCompilationCache** cache) { - return BuildCompilationCache(ctx, platform_info, cache); + return BuildXlaCompilationCache(ctx->device(), platform_info, cache); })); // Hold the reference to the JIT during evaluation. (We could probably // free it sooner because the ResourceMgr will retain a reference, but @@ -302,32 +191,11 @@ static Status CompileToLocalExecutable( *client = static_cast(cache->client()); absl::optional tf_allocator_adapter; - XlaCompiler::Options options; - options.client = *client; - if (ctx->op_device_context() != nullptr) { - options.device_ordinal = - ctx->op_device_context()->stream()->parent()->device_ordinal(); - } - options.device_type = cache->device_type(); - options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); - options.graph_def_version = ctx->function_library()->graph_def_version(); - options.allow_cpu_custom_calls = - (platform_info.platform_id() == se::host::kHostPlatformId); - options.device_allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info); - if (platform_info.xla_device_metadata()) { - options.shape_representation_fn = - platform_info.xla_device_metadata()->shape_representation_fn(); - } - // If reference variables are not present in the graph, we can safely alias - // passthrough parameters without performing a copy. - options.alias_passthrough_params = - !has_ref_vars && !platform_info.is_on_xla_device(); + XlaCompiler::Options options = GenerateCompilerOptions( + *cache, *ctx->function_library(), ctx->device(), + ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr, + platform_info, has_ref_vars, &tf_allocator_adapter); - std::map constant_args; - for (int i : constants) { - constant_args.insert({i, ctx->input(i)}); - } XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; // Optimization: where possible, have the computation return a naked array @@ -337,10 +205,11 @@ static Status CompileToLocalExecutable( !platform_info.is_on_xla_device() && may_alias_resource_update; - std::vector args; - TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_args, variable_infos, ctx, &args)); - return cache->Compile(options, function, args, compile_options, + xla::StatusOr> args = + XlaComputationLaunchContext::BuildXlaCompilerArguments(constants, inputs, + variable_infos); + TF_RETURN_IF_ERROR(args.status()); + return cache->Compile(options, function, *args, compile_options, lazy ? XlaCompilationCache::CompileMode::kLazy : XlaCompilationCache::CompileMode::kStrict, compilation_result, executable); @@ -350,6 +219,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(1) << "XlaLocalLaunchOpBase::Compute " << Canonicalize(function_.name(), AttrSlice(&function_.attr())); + std::vector inputs = InputsFromContext(ctx); xla::LocalClient* client; const XlaCompiler::CompilationResult* compilation_result; xla::LocalExecutable* executable; @@ -357,10 +227,11 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { std::vector variable_infos; { OP_REQUIRES_OK( - ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + ctx, GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); Status s = CompileToLocalExecutable( - ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, + ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, inputs, variable_infos, constants_, /*lazy=*/false, /*may_alias_resource_update=*/true, &client, &compilation_result, &executable); @@ -378,8 +249,10 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(1) << "Executing XLA Computation..."; absl::optional tf_allocator_adapter; - se::DeviceMemoryAllocator* allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); int device_ordinal = stream ? 
stream->parent()->device_ordinal() : client->default_device_ordinal(); XlaComputationLaunchContext launch_context( @@ -503,7 +376,7 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) constants_(ConstantsVector(ctx)), resources_(ResourcesVector(ctx)), function_(FunctionAttr(ctx)), - platform_info_(PlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())), must_compile_(MustCompileAttr(ctx)), has_ref_vars_(HasRefVars(ctx)) {} @@ -515,6 +388,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { xla::LocalExecutable* executable; ResourceVarsSnapshot variables; + std::vector inputs = InputsFromContext(ctx); bool cannot_compile_cluster; { mutex_lock guard(cannot_compile_cluster_mu_); @@ -527,13 +401,14 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { } else { std::vector variable_infos; OP_REQUIRES_OK( - ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + ctx, GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); // Do not alias resource updates as locking variables in XlaCompile and // unlocking them in XlaRun may lead to deadlocks. Status status = CompileToLocalExecutable( - ctx, function_, has_ref_vars_, platform_info_, variable_infos, + ctx, function_, has_ref_vars_, platform_info_, inputs, variable_infos, constants_, /*lazy=*/!must_compile_, /*may_alias_resource_update=*/false, &client, &kernel, &executable); @@ -591,7 +466,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { } XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) - : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {} + : OpKernel(ctx), platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {} void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); @@ -602,8 +477,10 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { XlaExecutableClosureStore::Global()->Consume(key); absl::optional tf_allocator_adapter; - se::DeviceMemoryAllocator* allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; int device_ordinal = stream ? stream->parent()->device_ordinal() diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index 112408226a8..78707c8126d 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -31,61 +32,6 @@ limitations under the License. namespace tensorflow { -// Holds some information about the platform on which an -// XlaLaunch/_XlaCompile/_XlaRun op must run on. 
-class XlaPlatformInfo { - public: - XlaPlatformInfo() : device_type_("") {} - XlaPlatformInfo(XlaPlatformInfo&&) = default; - explicit XlaPlatformInfo(const DeviceType device_type, - se::Platform::Id platform_id, - const XlaDevice::Metadata* xla_device_metadata, - se::DeviceMemoryAllocator* device_allocator) - : device_type_(device_type), - platform_id_(platform_id), - xla_device_metadata_(xla_device_metadata), - device_allocator_(device_allocator) {} - - XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; - - bool UseMultipleStreams() const { - return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); - } - - // Non-null only when run on an XLA device. - se::DeviceMemoryAllocator* custom_allocator() const { - return device_allocator_; - } - - DeviceType device_type() const { return device_type_; } - - // This is equal to xla_device_metadata()->platform()->id() if - // xla_device_metadata() is not nullptr. - se::Platform::Id platform_id() const { return platform_id_; } - - // This may be null if the op this XlaPlatformInfo is for was not placed on an - // XLA device. - const XlaDevice::Metadata* xla_device_metadata() const { - return xla_device_metadata_; - } - bool is_on_xla_device() const { return xla_device_metadata() != nullptr; } - - private: - DeviceType device_type_; - se::Platform::Id platform_id_; - - // xla_device_metadata_ lives in the tensorflow::DeviceBase in which the - // XlaLaunch/_XlaCompile/_XlaRun op is placed and thus does not die before the - // XlaLaunch/_XlaCompile/_XlaRun OpKernel. - const XlaDevice::Metadata* xla_device_metadata_; - - // If the op associated with this XlaPlatformInfo is placed on an XLA device - // then device_allocator_ is the xla::Backend's memory allocator. If the op - // is placed on a regular CPU or GPU device then device_allocator_ is null. - se::DeviceMemoryAllocator* device_allocator_; - - TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); -}; // XlaLocalLaunchBase is almost the same as XlaLocalLaunchOp. // The only difference is that it does not require arguments to follow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 19eb61b6f72..81403fbf2dc 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -32,12 +32,12 @@ limitations under the License. 
#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -1196,12 +1196,9 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() { continue; } - DeviceType jit_device_type(registration->compilation_device_name); - - RecursiveCompilabilityChecker::OperationFilter op_filter = - CreateOperationFilter(*registration); - - if (!RecursiveCompilabilityChecker{&op_filter, &jit_device_type} + if (!RecursiveCompilabilityChecker{ + CreateOperationFilter(*registration), + DeviceType{registration->compilation_device_name}} .IsCompilableNode(*node, lib_runtime)) { continue; } @@ -1718,7 +1715,6 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, const XlaOpRegistry::DeviceRegistration* registration; CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), ®istration)); - DeviceType jit_device_type(registration->compilation_device_name); // We can always *compile* resource operations, stateful RNGs and dummy ops, // even if we are sometimes unable to auto-cluster them. @@ -1733,7 +1729,8 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, op_filter.allow_slow_ops = true; op_filter.allow_inaccurate_ops = true; - RecursiveCompilabilityChecker checker{&op_filter, &jit_device_type}; + RecursiveCompilabilityChecker checker{ + op_filter, DeviceType{registration->compilation_device_name}}; if (!uncompilable_node_info) { // We do not need uncompilable node info. Just return the result. 
return checker.IsCompilableCall(ndef, flr); @@ -1837,7 +1834,9 @@ absl::flat_hash_map>* GetAllowlistTable() { "ConcatOffset", "Const", "MirrorPad", "Pack", "Pad", "PadV2", "Reverse", "ReverseV2", "ReverseSequence", "Slice", "Split", "SplitV", "StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign", - "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex"}}}; + "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex", + "TensorStridedSliceUpdate", + }}}; // clang-format on return result; } @@ -1996,6 +1995,8 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "ResourceScatterNdUpdate", "ResourceScatterSub", "ResourceScatterUpdate", + "RngReadAndSkip", + "RngSkip", "Roll", "ScatterNd", "SelfAdjointEigV2", @@ -2018,11 +2019,17 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "StatelessCase", "StatelessIf", "StatelessMultinomial", + "StatelessRandomGetKeyCounterAlg", "StatelessRandomNormal", + "StatelessRandomNormalV2", "StatelessRandomUniform", + "StatelessRandomUniformV2", "StatelessRandomUniformInt", + "StatelessRandomUniformIntV2", "StatelessRandomUniformFullInt", + "StatelessRandomUniformFullIntV2", "StatelessTruncatedNormal", + "StatelessTruncatedNormalV2", "StatelessWhile", "Svd", "SymbolicGradient", @@ -2080,6 +2087,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "XlaSelectAndScatter", "XlaSelfAdjointEig", "XlaSend", + "XlaSetBound", "XlaSharding", "XlaSort", "XlaSpmdFullToShardShape", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index e88319bb732..1be3e5ba9e7 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -44,6 +44,11 @@ using ::tensorflow::testing::FindNodeByName; namespace tensorflow { namespace { +static bool Initialized = [] { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + return true; +}(); + REGISTER_OP("UncompilableNullary").Output("o: float"); REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index 7378d17f88d..87c9fbf0af7 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -406,37 +406,6 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) { EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0"); } -TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - Output dynamic_slice_operand = - ops::Placeholder(s.WithOpName("dynamic_slice_operand"), DT_INT32, - ops::Placeholder::Attrs{}); - Output dynamic_slice_begin = ops::Placeholder( - s.WithOpName("dynamic_slice_begin"), DT_INT32, ops::Placeholder::Attrs{}); - Output dynamic_slice_size = ops::Placeholder( - s.WithOpName("dynamic_slice_size"), DT_INT32, ops::Placeholder::Attrs{}); - Output dynamic_slice = - ops::XlaDynamicSlice(s.WithOpName("dynamic_slice"), dynamic_slice_operand, - dynamic_slice_begin, dynamic_slice_size); - - Output reshape_input = ops::Placeholder(s.WithOpName("reshape_input"), - DT_FLOAT, ops::Placeholder::Attrs{}); - Output reshape = - ops::Reshape(s.WithOpName("reshape"), reshape_input, dynamic_slice); - - AddToCluster({dynamic_slice.node(), reshape.node()}, "cluster_0"); - - std::unique_ptr graph = absl::make_unique(OpRegistry::Global()); - 
TF_ASSERT_OK(s.ToGraph(graph.get())); - - Node* n = FindNodeByName(*graph, "dynamic_slice"); - ASSERT_NE(n, nullptr); - - TF_ASSERT_OK(PartiallyDecluster(&graph)); - - EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0"); -} - TEST(PartiallyDeclusterPassTest, EliminatedUnusedNodes) { const char* const kClusteredProducer0Name = "ClusteredProducer0"; const char* const kClusteredProducer1Name = "ClusteredProducer1"; diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index b1525337dbc..fb184d62e27 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -20,9 +20,11 @@ limitations under the License. #include "absl/base/call_once.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -278,25 +280,23 @@ Status XlaCompilationCache::CompileSingleOp( const NodeDef& node_def = ctx->op_kernel().def(); TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); - bool are_args_supported = - absl::c_all_of(args, [](const XlaCompiler::Argument arg) { - return arg.kind == XlaCompiler::Argument::kConstant || - arg.kind == XlaCompiler::Argument::kParameter; + bool has_tensor_list_arg = + absl::c_any_of(args, [](const XlaCompiler::Argument arg) { + return arg.kind == XlaCompiler::Argument::kTensorList; }); const ConfigProto* config = ctx->function_library()->config_proto(); bool use_mlir = config && config->experimental().enable_mlir_bridge(); - // TODO(b/155596779): Understand the source of other argument types and - // depending on the source either support those or avoid these codepath. - if (!use_mlir || !are_args_supported) { + // TODO(b/155596779): Support TensorList args. 
+ if (!use_mlir || !has_tensor_list_arg) { return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); } GraphDebugInfo debug_info; return CompileGraphToXlaHlo( - *graph, {args.data(), args.size()}, options.device_type.type_string(), - compile_options.use_tuple_arg, *options.flib_def, debug_info, - options.shape_representation_fn, result); + *graph, mlir::SpanToArrayRef(args), + options.device_type.type_string(), compile_options.use_tuple_arg, + *options.flib_def, debug_info, options.shape_representation_fn, result); }; return CompileImpl(options, name, args, compile_op, /*compile_threshold=*/absl::nullopt, @@ -325,6 +325,10 @@ Status XlaCompilationCache::CompileImpl( absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { + if (FailOnXlaCompilation()) { + return errors::Internal("XLA compilation disabled"); + } + DCHECK_NE(out_executable, nullptr); VLOG(2) << "XlaCompilationCache::Compile " << DebugString(); diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc index 7227615d2bb..5578925b790 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache_test.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -52,6 +54,30 @@ TEST(XlaCompilationCacheTest, SignatureEquality) { } } +TEST(XlaCompilationCacheTest, TestDisabledXlaCompilation) { + NameAttrList fn; + fn.set_name("afunction"); + + DisableXlaCompilation(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + DeviceType device_type = DeviceType(DEVICE_CPU_XLA_JIT); + + const XlaCompiler::CompilationResult* compilation_result; + xla::LocalExecutable* executable; + + auto cache = new XlaCompilationCache(client, device_type); + core::ScopedUnref cache_ref(cache); + + Status status = cache->Compile(XlaCompiler::Options{}, fn, {}, + XlaCompiler::CompileOptions{}, + XlaCompilationCache::CompileMode::kStrict, + &compilation_result, &executable); + EXPECT_FALSE(status.ok()); + EXPECT_TRUE( + absl::StrContains(status.error_message(), "XLA compilation disabled")); +} + static void BM_BuildSignature(int iters, int n_args) { NameAttrList fn; fn.set_name("afunction"); diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 50813859603..d092508eccf 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -41,18 +42,23 @@ static std::vector GetResourceVariableIndices(OpKernelContext* ctx) { } Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, - const XlaDevice::Metadata& metadata, + XlaCompilationCache* cache, const XlaCompiler::CompilationResult* result, xla::LocalExecutable* executable, const ResourceVarsSnapshot& variable_args) { - xla::LocalClient* client = metadata.client(); + xla::LocalClient* client = static_cast(cache->client()); - // Builds an XLA allocator for the device. + absl::optional tf_allocator_adapter; + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); XlaComputationLaunchContext launch_context( - client, client->backend().memory_allocator(), - client->default_device_ordinal(), - /*allocate_xla_tensors=*/true, - /*use_multiple_streams=*/metadata.UseMultipleStreams()); + client, allocator, client->default_device_ordinal(), + /*allocate_xla_tensors=*/platform_info_.xla_device_metadata() != nullptr, + platform_info_.xla_device_metadata() + ? platform_info_.xla_device_metadata()->UseMultipleStreams() + : false); std::map snapshot_ptrs; for (auto& p : variable_args) { @@ -70,12 +76,11 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; - TF_RET_CHECK(stream); VLOG(2) << "Executing computation: " << name(); xla::ExecutableRunOptions run_options; run_options.set_stream(stream); - run_options.set_allocator(client->backend().memory_allocator()); + run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); @@ -94,98 +99,39 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, return Status::OK(); } -Status XlaCompileOnDemandOp::MustArgumentBeConstant( - const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, bool* result) { - *result = false; +Status XlaCompileOnDemandOp::Compile( + OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, + XlaCompilationCache** cache, ResourceVarsSnapshot* variable_args, + xla::LocalExecutable** executable) { - // TODO(jmolloy): This could be expensive, so memoize. std::vector constant_input_indices; TF_RETURN_IF_ERROR(GetCompileTimeConstInputs( - op_kernel, &constant_input_indices, flib_runtime)); - *result = absl::c_binary_search(constant_input_indices, argument_idx); - return Status::OK(); -} - -// TODO(ycao): Remove the need to call ShouldArgumentBeConstant. Its benefit is -// not clear yet and it causes heavy constant analysis to run twice. 
-Status XlaCompileOnDemandOp::ShouldArgumentBeConstant( - const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, bool* result) { - return MustArgumentBeConstant(op_kernel, argument_idx, flib_runtime, result); -} - -Status XlaCompileOnDemandOp::Compile( - OpKernelContext* ctx, const XlaDevice::Metadata& metadata, - const XlaCompiler::CompilationResult** result, - ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable) { - std::map constant_arguments; - for (int64 i = 0; i < ctx->num_inputs(); ++i) { - const Tensor& device_tensor = ctx->input(i); - if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) { - if (xla_tensor->has_host_tensor()) { - bool should_arg_be_const; - TF_RETURN_IF_ERROR(ShouldArgumentBeConstant(&ctx->op_kernel(), i, - ctx->function_library(), - &should_arg_be_const)); - if (should_arg_be_const) { - constant_arguments[i] = xla_tensor->host_tensor(); - } - } - } - - if (constant_arguments.count(i) == 0) { - bool must_argument_be_const; - TF_RETURN_IF_ERROR(MustArgumentBeConstant(&ctx->op_kernel(), i, - ctx->function_library(), - &must_argument_be_const)); - - if (must_argument_be_const) { - // Slow path; the argument is not available as a host constant so we - // must fetch it synchronously. - Tensor host_tensor; - AllocatorAttributes attrs; - attrs.set_on_host(true); - TF_RETURN_IF_ERROR(ctx->allocate_temp( - device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs)); - Status status = ctx->op_device_context()->CopyDeviceTensorToCPUSync( - &device_tensor, "ConstantArgument", - reinterpret_cast(ctx->device()), &host_tensor); - if (!status.ok()) { - LOG(ERROR) << "Copying tensor of shape " - << device_tensor.shape().DebugString() << " from " - << ctx->device()->name() << "to CPU failed with " - << status.ToString(); - return status; - } - constant_arguments[i] = host_tensor; - } - } + &ctx->op_kernel(), &constant_input_indices, ctx->function_library())); + if (!absl::c_all_of(constant_input_indices, [&](int idx) { + return ctx->input_memory_type(idx) == HOST_MEMORY; + })) { + return errors::Internal("Unexpected device placement for a constant input"); } + std::vector inputs = InputsFromContext(ctx); // We store information about the JIT-compiled XLA computation // in the ResourceMgr. ResourceMgr* rm = ctx->resource_manager(); CHECK(rm); - XlaCompilationCache* cache; TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "xla_cache", &cache, - [&](XlaCompilationCache** cache) { - *cache = new XlaCompilationCache(metadata.client(), - metadata.jit_device_type()); - return Status::OK(); + rm->default_container(), "xla_cache", cache, + [&](XlaCompilationCache** write_into_cache) { + return BuildXlaCompilationCache(ctx->device(), platform_info_, + write_into_cache); })); - // Hold the reference to the JIT during evaluation. (We could probably - // free it sooner because the ResourceMgr will retain a reference, but - // this is more obviously correct.) - core::ScopedUnref cache_ref(cache); - XlaCompiler::Options options; - options.device_type = metadata.jit_device_type(); - options.client = metadata.client(); - options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); - options.shape_representation_fn = metadata.shape_representation_fn(); + absl::optional tf_allocator_adapter; + XlaCompiler::Options options = GenerateCompilerOptions( + **cache, *ctx->function_library(), ctx->device(), + ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr, + platform_info_, + /*has_ref_vars=*/true, &tf_allocator_adapter); XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; @@ -194,31 +140,41 @@ Status XlaCompileOnDemandOp::Compile( compile_options.always_return_tuple = false; std::vector variables_indices = GetResourceVariableIndices(ctx); - std::vector args; + xla::StatusOr> args; { std::vector variable_infos; TF_RETURN_IF_ERROR( - GetVariableInfosFromCtxInputs(ctx, variables_indices, &variable_infos)); + GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, variables_indices, &variable_infos)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); TF_RETURN_IF_ERROR(SnapshotResourceVariables( ctx, variables_indices, variable_infos, variable_args)); - TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_arguments, variable_infos, ctx, &args)); + + args = XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_input_indices, inputs, variable_infos); + TF_RETURN_IF_ERROR(args.status()); } - return cache->CompileSingleOp(options, args, ctx, compile_options, result, - executable); + return (*cache)->CompileSingleOp(options, *args, ctx, compile_options, result, + executable); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { const XlaCompiler::CompilationResult* result; xla::LocalExecutable* executable; - const XlaDevice::Metadata* metadata; - OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata)); ResourceVarsSnapshot variable_args; + XlaCompilationCache* cache; + OP_REQUIRES(ctx, ctx->function_library(), + errors::Internal("Function library missing")); OP_REQUIRES_OK(ctx, - Compile(ctx, *metadata, &result, &variable_args, &executable)); - OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable, variable_args)); + Compile(ctx, &result, &cache, &variable_args, &executable)); + + // Hold the reference to the JIT during evaluation. (We could probably + // free it sooner because the ResourceMgr will retain a reference, but + // this is more obviously correct.) + core::ScopedUnref cache_ref(cache); + OP_REQUIRES_OK(ctx, Run(ctx, cache, result, executable, variable_args)); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index cc5f2f1e42f..bb8ab889ce9 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/framework/function.h" @@ -35,25 +36,25 @@ namespace tensorflow { // vanilla TensorFlow op as long as the bridge supports it. 
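In the rewritten Compile() above, the compilation cache is created through BuildXlaCompilationCache() and owned by the ResourceMgr, while Compute() pins it with a ScopedUnref for the duration of the run. A condensed sketch of that lifetime pattern, using the same names as the diff (error handling trimmed; to be read as part of the Status-returning Compile()):

// One XlaCompilationCache per device lives in the ResourceMgr; the caller
// holds an extra reference until the compiled executable has finished running.
XlaCompilationCache* cache = nullptr;
TF_RETURN_IF_ERROR(ctx->resource_manager()->LookupOrCreate<XlaCompilationCache>(
    ctx->resource_manager()->default_container(), "xla_cache", &cache,
    [&](XlaCompilationCache** write_into_cache) {
      return BuildXlaCompilationCache(ctx->device(), platform_info_,
                                      write_into_cache);
    }));
core::ScopedUnref cache_ref(cache);  // dropped only after Run() completes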
class XlaCompileOnDemandOp : public OpKernel { public: - explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) + : OpKernel(ctx), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {} void Compute(OpKernelContext* ctx) override; private: XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64 i); - Status ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, - bool* result); - Status MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx, - FunctionLibraryRuntime* flib_runtime, - bool* result); - Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, + Status Compile(OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, + XlaCompilationCache** cache, ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable); - Status Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, + + Status Run(OpKernelContext* ctx, XlaCompilationCache* cache, const XlaCompiler::CompilationResult* result, xla::LocalExecutable* executable, const ResourceVarsSnapshot& variable_args); + + const XlaPlatformInfo platform_info_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 446cd8944de..dd1ddb616f5 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -51,7 +51,7 @@ Status XlaCpuDeviceFactory::CreateDevices( std::vector>* devices) { XlaDeviceFlags* flags = GetXlaDeviceFlags(); if (!flags->tf_xla_enable_xla_devices) { - LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; + VLOG(1) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; return Status::OK(); } bool compile_on_demand = flags->tf_xla_compile_on_demand; diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 7842513331d..089d22dca03 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -61,6 +61,21 @@ limitations under the License. namespace tensorflow { +// Default PaddedShapeFn implementation that simply returns the unpadded +// on-device shape. This is accurate for CPU and GPU devices that neither +// transpose nor pad tensors. +Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) { + const tensorflow::XlaTensor* xla_tensor = + tensorflow::XlaTensor::FromTensor(&tensor); + if (xla_tensor == nullptr) { + return TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), shape); + } + + const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer(); + *shape = shaped_buffer.on_device_shape(); + return Status::OK(); +} + // Caches a XlaDeviceAllocator per pair. A // XlaDeviceAllocator is created on demand and is associated with a // XlaDevice. It outlives the device itself (for instance, the buffer @@ -116,20 +131,6 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( namespace { -// Default PaddedShapeFn implementation that simply returns the unpadded -// on-device shape. This is accurate for CPU and GPU devices that neither -// transpose nor pad tensors. 
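DefaultPaddedShapeFn is moved out of the anonymous namespace here (and declared in xla_device.h later in this diff) so that other translation units can use it as a PaddedShapeFn. A small usage sketch, assuming an ordinary host tensor rather than an XlaTensor:

// For a plain Tensor (no backing XlaTensor) the function simply converts
// dtype + TensorShape into the equivalent unpadded xla::Shape.
Tensor t(DT_FLOAT, TensorShape({2, 3}));
xla::Shape shape;
Status s = DefaultPaddedShapeFn(t, &shape);
if (s.ok()) {
  VLOG(1) << "On-device shape: " << shape.ToString();
}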
-Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) { - const tensorflow::XlaTensor* xla_tensor = - tensorflow::XlaTensor::FromTensor(&tensor); - if (xla_tensor == nullptr) { - return TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), shape); - } - - const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer(); - *shape = shaped_buffer.on_device_shape(); - return Status::OK(); -} static DeviceAttributes BuildXlaDeviceAttributes(const string& name_prefix, const string& device_name, @@ -572,8 +573,7 @@ XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, // Any op assigned to the device that isn't rewritten by the graph rewriter // gets executed by an XlaCompileOnDemandOp, which compiles it and executes // it just-in-time. - OpKernel* (*factory)(OpKernelConstruction*) = - [](OpKernelConstruction* context) -> OpKernel* { + auto factory = [](OpKernelConstruction* context) -> OpKernel* { return new XlaCompileOnDemandOp(context); }; XlaOpRegistry::RegisterCompilationKernels(); @@ -582,6 +582,13 @@ XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, jit_device, /*include_compilation_only_kernels=*/false)) { KernelDef* def = new KernelDef(*jit_def); + const std::unordered_set* constant_inputs = + XlaOpRegistry::CompileTimeConstantInputArgNames(def->op()); + + for (const std::string& arg_name : *constant_inputs) { + def->add_host_memory_arg(arg_name); + } + def->set_device_type(device); registrations->op_kernel_registrars.emplace_back( new kernel_factory::OpKernelRegistrar(def, "XlaCompileOnDemandOp", diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 30f9a99e36a..6d6086ce0fa 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -94,6 +94,11 @@ class XlaDevice : public LocalDevice { static Status GetMetadata(OpKernelConstruction* ctx, const Metadata** metadata); + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by + // `device`. + static Status GetMetadataFromDevice(DeviceBase* device, + const XlaDevice::Metadata** metadata); + struct Options { // The StreamExecutor platform. Not owned. Must be non-null. 
se::Platform* platform = nullptr; @@ -196,8 +201,6 @@ class XlaDevice : public LocalDevice { xla::StatusOr> GetDeviceContextLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - static Status GetMetadataFromDevice(DeviceBase* device, - const XlaDevice::Metadata** metadata); Status MakeTensorFromProto(XlaDeviceContext* device_context, const TensorProto& tensor_proto, @@ -280,6 +283,8 @@ struct XlaDeviceOpRegistrations { XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, const char* jit_device); +Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 16f496d51a3..99ba5658819 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -66,7 +66,7 @@ class XlaGpuDeviceFactory : public DeviceFactory { Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector* devices) { XlaDeviceFlags* flags = GetXlaDeviceFlags(); if (!flags->tf_xla_enable_xla_devices) { - LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; + VLOG(1) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc deleted file mode 100644 index f720183e196..00000000000 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Registers the XLA_INTERPRETER device which exposes the XLA Interpreter. 
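The RegisterXlaDeviceKernels() change above pins every compile-time-constant input of a registered op to host memory, which is what lets the new XlaCompileOnDemandOp::Compile() simply verify the placement instead of copying values back from the device. The two ends of that contract, condensed from the hunks above (`def`, `ctx` and `constant_input_indices` are the variables from those hunks):

// Producer side (RegisterXlaDeviceKernels): constant inputs become
// host-memory arguments of the on-demand kernel.
for (const std::string& arg_name :
     *XlaOpRegistry::CompileTimeConstantInputArgNames(def->op())) {
  def->add_host_memory_arg(arg_name);
}

// Consumer side (XlaCompileOnDemandOp::Compile): values are already on the
// host, so the old synchronous device-to-host copy path is gone.
if (!absl::c_all_of(constant_input_indices, [&](int idx) {
      return ctx->input_memory_type(idx) == HOST_MEMORY;
    })) {
  return errors::Internal("Unexpected device placement for a constant input");
}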
- -#include "absl/memory/memory.h" -#include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/xla_device.h" -#include "tensorflow/compiler/jit/xla_device_ops.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" - -namespace tensorflow { - -const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER"; -const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT"; - -constexpr std::array kExecAllTypes = { - {DT_INT8, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, - DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}}; - -class XlaInterpreterDeviceFactory : public DeviceFactory { - public: - Status ListPhysicalDevices(std::vector* devices) override; - Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector>* devices) override; -}; - -Status XlaInterpreterDeviceFactory::ListPhysicalDevices( - std::vector* devices) { - devices->push_back( - absl::StrCat("/physical_device:", DEVICE_XLA_INTERPRETER, ":0")); - - return Status::OK(); -} - -Status XlaInterpreterDeviceFactory::CreateDevices( - const SessionOptions& session_options, const string& name_prefix, - std::vector>* devices) { - static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels( - DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT); - (void)registrations; - - XlaOpRegistry::DeviceRegistration registration; - registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - registration.autoclustering_policy = - XlaOpRegistry::AutoclusteringPolicy::kAlways; - registration.cluster_resource_variable_ops_unsafely = true; - registration.cluster_stack_ops = false; - registration.cluster_tensor_array_ops = true; - registration.cluster_stateful_rng_ops = true; - registration.cluster_control_trigger = true; - registration.elide_assert_and_checknumerics = true; - registration.cluster_variant_ops = true; - registration.cluster_slow_ops = true; - registration.cluster_inaccurate_ops = true; - XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, - registration); - - TF_ASSIGN_OR_RETURN( - auto platform, se::MultiPlatformManager::PlatformWithName("Interpreter")); - - XlaDevice::Options options; - options.platform = platform; - options.device_name_prefix = name_prefix; - options.device_name = DEVICE_XLA_INTERPRETER; - options.device_ordinal = 0; - options.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - options.use_multiple_streams = false; - devices->push_back(absl::make_unique(session_options, options)); - - return Status::OK(); -} - -// Set priority to be below the default priority (50), so that Interpreter is -// not selected as a high priority device over other default devices. See -// constructor comments for Registrar in -// tensorflow/core/common_runtime/device_factory.h for a list of priority for -// devices. 
-REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_INTERPRETER, - XlaInterpreterDeviceFactory, 40); - -// Kernel registrations -static bool OpFilter(KernelDef* kdef) { return true; } - -REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_INTERPRETER, XlaLocalLaunchOp, - kExecAllTypes); -REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_INTERPRETER, XlaCompileOp, - kExecAllTypes); -REGISTER_XLA_RUN_KERNEL(DEVICE_XLA_INTERPRETER, XlaRunOp, kExecAllTypes); - -REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_INTERPRETER, kExecAllTypes); -REGISTER_XLA_BACKEND(DEVICE_INTERPRETER_XLA_JIT, kExecAllTypes, OpFilter); - -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index 5ca146969e0..7387978fbcd 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -14,10 +14,21 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/xla_kernel_creator.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" +#include "tensorflow/compiler/jit/kernels/xla_ops.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { @@ -27,6 +38,78 @@ bool XlaKernelCreator::CanCreateKernel( return CanCreateXlaKernel(props->node_def); } +static Status CreateXlaKernel(FunctionLibraryRuntime* flr, + const NodeDef& node_def, + std::unique_ptr* kernel) { + if (!CanCreateXlaKernel(node_def)) { + return errors::Internal("Invalid node: ", node_def.ShortDebugString()); + } + + VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); + + // Make sure that kernels have been registered on the JIT device. + XlaOpRegistry::RegisterCompilationKernels(); + + // Only check for compilability if the MLIR bridge is not enabled. 
+ if (!GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; + if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { + std::vector + uncompilable_node_info; + for (const auto& it : uncompilable_nodes_map) { + for (const auto& info : it.second.second) { + uncompilable_node_info.emplace_back(info); + } + } + string message = absl::StrCat( + "Function invoked by the following node is not compilable: ", + SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); + absl::StrAppend(&message, "Uncompilable nodes:"); + for (const auto& node_info : uncompilable_node_info) { + string node_message = absl::StrCat("\n", node_info.name, ": ", + node_info.uncompilable_reason, "\n", + "\tStacktrace:\n"); + for (const auto& stack_frame : node_info.stack_trace) { + absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", + stack_frame.name, stack_frame.function_name); + } + absl::StrAppend(&message, node_message); + } + VLOG(1) << message; + return errors::InvalidArgument(message); + } + } + + // Get function body, constant args, and resource args. + NameAttrList function; + TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); + const FunctionBody* fbody = nullptr; + std::vector constant_arg_indices; + std::vector resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); + + MemoryTypeVector input_memory_types = + GetInputMemoryTypes(fbody, constant_arg_indices, resource_arg_indices); + MemoryTypeVector output_memory_types = GetOutputMemoryTypes(fbody); + + // Create the kernel. + Device* dev = flr->device(); + Status s; + auto props = std::make_shared( + &fbody->fdef.signature(), node_def, fbody->arg_types, fbody->ret_types); + OpKernelConstruction construction(DeviceType(dev->device_type()), dev, + dev->GetAllocator(AllocatorAttributes()), + flr, dev->resource_manager(), props, + input_memory_types, output_memory_types, + flr->graph_def_version(), &s); + + *kernel = absl::make_unique( + &construction, constant_arg_indices, resource_arg_indices, function, + /*has_ref_vars=*/false); + return s; +} + Status XlaKernelCreator::CreateKernel( FunctionLibraryRuntime* flr, const std::shared_ptr& props, @@ -34,19 +117,12 @@ Status XlaKernelCreator::CreateKernel( return CreateXlaKernel(flr, props->node_def, kernel); } -namespace { - -bool RegisterLaunchOpCreator() { +static bool RegisterLaunchOpCreator() { XlaKernelCreator* xla_kernel_creator = new XlaKernelCreator(); RegisterDefaultCustomKernelCreator(xla_kernel_creator); return true; } static bool register_me = RegisterLaunchOpCreator(); -static bool register_xla = [] { - SetXlaIsEnabled(); - return true; -}(); -} // end namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc deleted file mode 100644 index 61c89d8a67a..00000000000 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
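With xla_kernel_creator_util.cc deleted below, CreateXlaKernel() now lives directly in xla_kernel_creator.cc and is reached through the XlaKernelCreator instance registered by RegisterLaunchOpCreator(). A hedged sketch of how the creator is consulted when a compilable function node is instantiated; `flr` and `props` stand for the FunctionLibraryRuntime and NodeProperties of that node, and the surrounding plumbing is simplified:

// Sketch only: consulting the custom kernel creator for a function node.
XlaKernelCreator creator;
std::unique_ptr<OpKernel> kernel;
if (creator.CanCreateKernel(*flr, props)) {
  // Internally: check compilability (unless the MLIR bridge is enabled),
  // compute constant/resource argument indices and memory types, then build
  // an XlaLocalLaunchBase kernel for the function.
  TF_RETURN_IF_ERROR(creator.CreateKernel(flr, props, &kernel));
}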
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/jit/xla_kernel_creator_util.h" - -#include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" -#include "tensorflow/compiler/jit/compilability_check_util.h" -#include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" -#include "tensorflow/compiler/tf2xla/const_analysis.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/ptr_util.h" - -namespace tensorflow { -namespace { - -// Utility which searches for values in a sorted list by scanning over it once. -// No matter how many times ScanForValue is called, the list is scanned at most -// once. However, if a call to ScanForValue skips over a value, that value is -// not revisited in future calls to ScanForValue, so callers must take -// care to order their calls. -// -// Useful for merging multiple sorted lists in O(n) time. -class SinglePassSearch { - public: - // Creates a SinglePassSearch object that can be used to search in `values`. - // Does not take ownership of `values`. `values` must outlive this. - // `values` must be sorted. - explicit SinglePassSearch(const std::vector* values) - : current_index_(0), values_(values) {} - - // Scans forward in the vector looking for "value", updating the internal - // position in to the vector. - // Returns true iff the vector contains the given value at or after current - // position. - // Not thread-safe. - bool ScanForValue(int value) { - while (current_index_ < values_->size() && - (*values_)[current_index_] <= value) { - if ((*values_)[current_index_] == value) { - current_index_++; - return true; - } - current_index_++; - } - return false; - } - - private: - int current_index_; - const std::vector* values_; -}; -} // namespace - -Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, - std::unique_ptr* kernel) { - if (!CanCreateXlaKernel(node_def)) { - return errors::Internal("Invalid node: ", node_def.ShortDebugString()); - } - - VLOG(3) << "Attempting to create XlaLaunchOp for " << node_def.DebugString(); - - // Make sure that kernels have been registered on the JIT device. - XlaOpRegistry::RegisterCompilationKernels(); - - // Only check for compilability if the MLIR bridge is not enabled. 
- if (!GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { - RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; - if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { - std::vector - uncompilable_node_info; - for (const auto& it : uncompilable_nodes_map) { - for (const auto& info : it.second.second) { - uncompilable_node_info.emplace_back(info); - } - } - string message = absl::StrCat( - "Function invoked by the following node is not compilable: ", - SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); - absl::StrAppend(&message, "Uncompilable nodes:"); - for (const auto& node_info : uncompilable_node_info) { - string node_message = absl::StrCat("\n", node_info.name, ": ", - node_info.uncompilable_reason, "\n", - "\tStacktrace:\n"); - for (const auto& stack_frame : node_info.stack_trace) { - absl::StrAppendFormat(&node_message, "\t\tNode: %s, function: %s\n", - stack_frame.name, stack_frame.function_name); - } - absl::StrAppend(&message, node_message); - } - VLOG(1) << message; - return errors::InvalidArgument(message); - } - } - - // Get function body, constant args, and resource args. - const FunctionBody* fbody = nullptr; - std::vector constant_arg_indices; - std::vector resource_arg_indices; - TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); - - // Set input and output memory types. - MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); - // These indices are used only for optimization purposes. They allow us - // to loop over constant_arg_indices and resource_arg_indices only once - // while iterating over all the function arguments checking if it is a - // resource or a constant. - // The reason we optimized this code is because functions can have a lot of - // captured arguments. For example, the backward pass of ResNet50 takes in all - // 214 variables and a similar number of activations. - SinglePassSearch constants_search(&constant_arg_indices); - SinglePassSearch resources_search(&resource_arg_indices); - for (size_t i = 0; i < fbody->arg_types.size(); ++i) { - if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { - // Compile-time constants and resource handles are expected to be in - // host memory. - input_memory_types[i] = HOST_MEMORY; - } - } - // One might wonder, about the case where a compile-time constant argument - // (which must be in host memory) is also used as an input into an op, - // e.g. Add, that expects its inputs in device memory. Here is how it - // works now. - // First, what do we mean by "op expects an input in XYZ memory"? - // There are two types of "ops" here: the tf2xla kernel and the HLO - // computation it builds. The tf2xla kernel needs to retrieve the actual - // numeric value of the compile-time constant tensors, so it really expects - // them to be on in host memory. However, for other inputs, it refers to them - // using xla::ComputationDataHandle, which is just a symbolic handle that - // xla::ComputationBuilder assigns. How does this handle gets assigned for - // constant arguments? Even constant arguments get an _Arg node in the graph - // instantiated for Function compilation. The tf2xla kernel for constant _Arg - // nodes takes the constant value, converts it to XlaLiteral, and feeds it - // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. 
This - // constant XlaLiteral is included in the HLO graph, and subsequently, in - // the actual executable, which is copied to the device before being - // executed. Thus, when this executable runs, the constant is available in - // device memory. - - // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory except for resources. - MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); - for (size_t i = 0; i < fbody->ret_types.size(); ++i) { - if (fbody->ret_types[i] == DT_RESOURCE) { - output_memory_types[i] = HOST_MEMORY; - } - } - - // Create the kernel. - NameAttrList function; - TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); - Device* dev = flr->device(); - Status s; - auto props = std::make_shared( - &fbody->fdef.signature(), node_def, fbody->arg_types, fbody->ret_types); - OpKernelConstruction construction(DeviceType(dev->device_type()), dev, - dev->GetAllocator(AllocatorAttributes()), - flr, dev->resource_manager(), props, - input_memory_types, output_memory_types, - flr->graph_def_version(), &s); - - *kernel = absl::make_unique( - &construction, constant_arg_indices, resource_arg_indices, function, - /*has_ref_vars=*/false); - return s; -} -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 19e2b5a2bb5..a8b090f1450 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -44,12 +44,6 @@ namespace { using xla::ScopedShapedBuffer; using xla::ShapedBuffer; -const char kPossibleNonVariableResourceHintMessage[] = - "If the error is similar to `Trying to access resource using the wrong " - "type`, this is likely because XLA only accepts Resource Variables as " - "inputs by snapshotting their values. Other TensorFlow resource types like " - "TensorList/TensorArray/Stack are not supported. Try removing non-variable " - "resource inputs to XLA."; } // anonymous namespace VariableInfo::VariableInfo(int index, absl::string_view name, Var* var) @@ -85,19 +79,22 @@ VariableInfo::~VariableInfo() { } } -// Returns a vector of VariableInfo instances for the resource variable inputs -// to the kernel with context `ctx`. The input indices for the resource -// variable inputs are in `variable_indices`. -Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, - absl::Span variable_indices, - std::vector* result) { +Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + std::vector* result) { result->clear(); result->reserve(variable_indices.size()); for (int var_idx : variable_indices) { Var* variable = nullptr; - ResourceHandle handle = HandleFromInput(ctx, var_idx); - TF_RETURN_IF_ERROR( - LookupOrCreateResource(ctx, handle, &variable, [&](Var** ptr) { + ResourceHandle handle = inputs[var_idx]->flat()(0); + if (handle.device() != dev->attributes().name()) { + return errors::InvalidArgument("Trying to access resource ", + handle.name(), " located in device ", + dev->name()); + } + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + handle.container(), handle.name(), &variable, [](Var** ptr) { // This var is uninitialized for now. 
*ptr = new Var(DT_INVALID); return Status::OK(); @@ -107,6 +104,15 @@ Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, return Status::OK(); } +std::vector InputsFromContext(OpKernelContext* ctx) { + std::vector inputs; + inputs.reserve(ctx->num_inputs()); + for (int input_idx = 0; input_idx < ctx->num_inputs(); input_idx++) { + inputs.push_back(&ctx->input(input_idx)); + } + return inputs; +} + Status LockVariables(absl::Span variables) { std::vector lock_order(variables.size()); std::iota(lock_order.begin(), lock_order.end(), 0); @@ -358,9 +364,6 @@ static Status SetOutputForConstant( ctx->set_output(output_num, const_tensor); output_tensor = ctx->mutable_output(output_num); } - if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) { - xla_tensor->set_host_tensor(const_tensor); - } return Status::OK(); } @@ -557,11 +560,14 @@ Status XlaComputationLaunchContext::PopulateOutputs( return Status::OK(); } -Status XlaComputationLaunchContext::BuildXlaCompilerArguments( - const std::map& must_be_constant_args, - absl::Span variable_args, OpKernelContext* ctx, - std::vector* args) { - args->resize(ctx->num_inputs()); +xla::StatusOr> +XlaComputationLaunchContext::BuildXlaCompilerArguments( + absl::Span must_be_constant_idxs, + absl::Span inputs, + absl::Span variable_args) { + CHECK(absl::c_is_sorted(must_be_constant_idxs)); + std::vector out; + out.resize(inputs.size()); absl::flat_hash_map variable_info_lookup; for (const VariableInfo& info : variable_args) { @@ -571,33 +577,20 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( variable_info_lookup.emplace(info.index(), &info); } - for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { - XlaCompiler::Argument& arg = (*args)[input_num]; + for (int64 input_num = 0; input_num < inputs.size(); ++input_num) { + const Tensor* input = inputs[input_num]; - if (must_be_constant_args.count(input_num) > 0) { + XlaCompiler::Argument& arg = out[input_num]; + if (absl::c_binary_search(must_be_constant_idxs, input_num)) { // Handles compile-time constants. - const Tensor& input = must_be_constant_args.at(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); + TF_RET_CHECK(input->dtype() != DT_RESOURCE); arg.kind = XlaCompiler::Argument::kConstant; - arg.type = input.dtype(); - arg.shape = input.shape(); - arg.constant_value = input; - } else if (variable_info_lookup.count(input_num) == 0) { - // Handles the non-constant arguments. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - if (input.NumElements() > 0) { - arg.kind = XlaCompiler::Argument::kParameter; - } else { - arg.kind = XlaCompiler::Argument::kConstant; - arg.constant_value = input; - } - arg.type = input.dtype(); - arg.shape = input.shape(); - } else { + arg.type = input->dtype(); + arg.shape = input->shape(); + arg.constant_value = *input; + } else if (variable_info_lookup.count(input_num)) { // Handles resource variables. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() == DT_RESOURCE); + TF_RET_CHECK(input->dtype() == DT_RESOURCE); const VariableInfo& variable = *variable_info_lookup[input_num]; arg.name = std::string(variable.name()); arg.kind = XlaCompiler::Argument::kResource; @@ -616,10 +609,21 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( arg.type = DT_INVALID; arg.shape = TensorShape(); } + } else { + // Normal inputs. 
+ TF_RET_CHECK(input->dtype() != DT_RESOURCE); + if (input->NumElements() > 0) { + arg.kind = XlaCompiler::Argument::kParameter; + } else { + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = *input; + } + arg.type = input->dtype(); + arg.shape = input->shape(); } } - return Status::OK(); + return out; } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index b34b3059a4f..ac085a022c8 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -109,12 +109,16 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, Status LockVariables(absl::Span variables) TF_EXCLUSIVE_LOCK_FUNCTION(); -// Returns a vector of VariableInfo instances for the resource variable inputs -// to the kernel with context `ctx`. The input indices for the resource +// Returns a vector of VariableInfo instances for the resource variable inputs, +// given that *all* inputs are in `inputs`. The input indices for the resource // variable inputs are in `variable_indices`. -Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, - absl::Span variable_indices, - std::vector* result); +Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + std::vector* result); + +// Returns pointers to inputs stored in `ctx`. +std::vector InputsFromContext(OpKernelContext* ctx); // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. @@ -136,10 +140,10 @@ class XlaComputationLaunchContext { // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch // op. // Precondition: variables in `variable_args` are locked. - static Status BuildXlaCompilerArguments( - const std::map& constant_args, - absl::Span variable_args, OpKernelContext* ctx, - std::vector* args); + static xla::StatusOr> + BuildXlaCompilerArguments(absl::Span must_be_constant_idxs, + absl::Span inputs, + absl::Span variable_args); // Add all inputs within `ctx` as XLA arguments (returned by arguments()). // `variables` is a map from TensorFlow argument number to resource variable. diff --git a/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc b/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc new file mode 100644 index 00000000000..6c6c490e032 --- /dev/null +++ b/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Register XlaXXX operations on regular CPU/GPU devices using +// `XlaCompileOnDemandOp`. 
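BuildXlaCompilerArguments() no longer reads from OpKernelContext directly: it takes a sorted span of constant-input indices, the raw input tensors, and already-locked VariableInfo objects, and returns the argument vector in a StatusOr. A condensed caller-side sketch mirroring the call sites in this diff (error handling trimmed; `variables_indices` and `constant_input_indices` are the vectors computed earlier in Compile()):

// Building XlaCompiler arguments from plain tensors and locked variables.
std::vector<const Tensor*> inputs = InputsFromContext(ctx);
std::vector<VariableInfo> variable_infos;
TF_RETURN_IF_ERROR(GetVariableInfosFromInputs(ctx->resource_manager(),
                                              ctx->device(), inputs,
                                              variables_indices,
                                              &variable_infos));
TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos)));

xla::StatusOr<std::vector<XlaCompiler::Argument>> args =
    XlaComputationLaunchContext::BuildXlaCompilerArguments(
        constant_input_indices, inputs, variable_infos);
TF_RETURN_IF_ERROR(args.status());
// *args now holds kConstant / kResource / kParameter entries in input order.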
+#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +#define REGISTER_XLA_OPS_ON_DEVICE(DEVICE) \ + REGISTER_KERNEL_BUILDER(Name("XlaConv") \ + .HostMemory("window_strides") \ + .HostMemory("padding") \ + .HostMemory("lhs_dilation") \ + .HostMemory("rhs_dilation") \ + .HostMemory("feature_group_count") \ + .Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("XlaBroadcastHelper").HostMemory("broadcast_dims").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSelfAdjointEig").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSvd").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaDot").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("XlaDynamicSlice").HostMemory("size_indices").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaDynamicUpdateSlice").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaIf").Device(DEVICE), XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaPad") \ + .HostMemory("padding_low") \ + .HostMemory("padding_high") \ + .HostMemory("padding_interior") \ + .Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaRecv").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaReduce").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaReduceWindow") \ + .HostMemory("window_dimensions") \ + .HostMemory("window_strides") \ + .HostMemory("base_dilations") \ + .HostMemory("window_dilations") \ + .HostMemory("padding") \ + .Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSelectAndScatter") \ + .HostMemory("window_dimensions") \ + .HostMemory("window_strides") \ + .HostMemory("padding") \ + .Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSend").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSort").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaKeyValueSort").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaWhile").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaDequantize").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaEinsum").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSpmdShardToFullShape").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaSharding").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaReplicaId").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("XlaGather").HostMemory("slice_sizes").Device(DEVICE), \ + XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER(Name("XlaScatter").Device(DEVICE), \ + XlaCompileOnDemandOp); + +REGISTER_XLA_OPS_ON_DEVICE(DEVICE_CPU); +REGISTER_XLA_OPS_ON_DEVICE(DEVICE_GPU); + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc new file mode 100644 index 00000000000..b38bf9282b1 --- /dev/null +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -0,0 +1,158 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_platform_info.h" + +#include "tensorflow/compiler/xla/client/client_library.h" + +namespace tensorflow { + +Status BuildXlaCompilationCache(DeviceBase* device, + const XlaPlatformInfo& platform_info, + XlaCompilationCache** cache) { + if (platform_info.xla_device_metadata()) { + *cache = new XlaCompilationCache( + platform_info.xla_device_metadata()->client(), + platform_info.xla_device_metadata()->jit_device_type()); + return Status::OK(); + } + + auto platform = + se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()); + if (!platform.ok()) { + return platform.status(); + } + + xla::StatusOr compiler_for_platform = + xla::Compiler::GetForPlatform(platform.ValueOrDie()); + if (!compiler_for_platform.ok()) { + // In some rare cases (usually in unit tests with very small clusters) we + // may end up transforming an XLA cluster with at least one GPU operation + // (which would normally force the cluster to be compiled using XLA:GPU) + // into an XLA cluster with no GPU operations (i.e. containing only CPU + // operations). Such a cluster can fail compilation (in way that + // MarkForCompilation could not have detected) if the CPU JIT is not linked + // in. + // + // So bail out of _XlaCompile in this case, and let the executor handle the + // situation for us. 
+ const Status& status = compiler_for_platform.status(); + if (status.code() == error::NOT_FOUND) { + return errors::Unimplemented("Could not find compiler for platform ", + platform.ValueOrDie()->Name(), ": ", + status.ToString()); + } + } + + xla::LocalClientOptions client_options; + client_options.set_platform(platform.ValueOrDie()); + client_options.set_intra_op_parallelism_threads( + device->tensorflow_cpu_worker_threads()->num_threads); + auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); + if (!client.ok()) { + return client.status(); + } + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(platform_info.device_type().type(), + ®istration)) { + return errors::InvalidArgument("No JIT device registered for ", + platform_info.device_type().type()); + } + *cache = new XlaCompilationCache( + client.ValueOrDie(), DeviceType(registration->compilation_device_name)); + return Status::OK(); +} + +XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device_base) { + auto device = static_cast(device_base); + se::Platform::Id platform_id = nullptr; + const XlaDevice::Metadata* xla_device_metadata = nullptr; + se::DeviceMemoryAllocator* custom_allocator = nullptr; + + if (device->device_type() == DEVICE_CPU) { + platform_id = se::host::kHostPlatformId; + } else if (device->device_type() == DEVICE_GPU) { + platform_id = device->tensorflow_gpu_device_info() + ->stream->parent() + ->platform() + ->id(); + } else if (XlaDevice::GetMetadataFromDevice(device, &xla_device_metadata) + .ok()) { + // If we are on an XlaDevice, use the underlying XLA platform's allocator + // directly. We could use the StreamExecutor's allocator which may + // theoretically be more correct, but XLA returns a nice OOM message in a + // Status and StreamExecutor does not. + // + // Importantly we can't use ctx->device()->GetAllocator() as the allocator + // (which xla_allocator above uses) as on an XlaDevice, this is a dummy + // allocator that returns XlaTensor objects. The XlaCompiler needs a real + // allocator to allocate real buffers. + platform_id = xla_device_metadata->platform()->id(); + custom_allocator = + xla_device_metadata->client()->backend().memory_allocator(); + } + + return XlaPlatformInfo(DeviceType(device->device_type()), platform_id, + xla_device_metadata, custom_allocator); +} + +se::DeviceMemoryAllocator* GetAllocator( + absl::optional* tf_allocator_adapter, + DeviceBase* device, se::Stream* stream, + const XlaPlatformInfo& platform_info) { + if (platform_info.custom_allocator()) { + return platform_info.custom_allocator(); + } + if (!stream) { + // Stream is not set for the host platform. 
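XlaPlatformInfoFromDevice() above is what lets the same kernels run on both regular and XLA devices: for DEVICE_CPU it picks the host platform, for DEVICE_GPU the platform of the device's stream executor, and for XLA devices it reuses the metadata and its allocator. A short sketch of combining it with BuildXlaCompilationCache(); `ctx` is assumed to be the kernel's construction or compute context:

// Platform info is derived once; regular devices then need a cache built
// against a LocalClient for that platform, while XLA devices reuse metadata.
const XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(ctx->device());
if (!platform_info.is_on_xla_device()) {
  XlaCompilationCache* cache = nullptr;
  TF_RETURN_IF_ERROR(
      BuildXlaCompilationCache(ctx->device(), platform_info, &cache));
  core::ScopedUnref cache_ref(cache);  // normally kept alive via the ResourceMgr
}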
+ se::Platform* platform = + se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) + .ValueOrDie(); + tf_allocator_adapter->emplace(device->GetAllocator({}), platform); + return &tf_allocator_adapter->value(); + } + tf_allocator_adapter->emplace(device->GetAllocator({}), stream); + return &tf_allocator_adapter->value(); +} + +XlaCompiler::Options GenerateCompilerOptions( + const XlaCompilationCache& cache, + const FunctionLibraryRuntime& function_library, DeviceBase* device, + se::Stream* stream, const XlaPlatformInfo& platform_info, bool has_ref_vars, + absl::optional* tf_allocator_adapter) { + XlaCompiler::Options options; + options.client = static_cast(cache.client()); + if (stream != nullptr) { + options.device_ordinal = stream->parent()->device_ordinal(); + } + options.device_type = cache.device_type(); + options.flib_def = function_library.GetFunctionLibraryDefinition(); + options.graph_def_version = function_library.graph_def_version(); + options.allow_cpu_custom_calls = + (platform_info.platform_id() == se::host::kHostPlatformId); + options.device_allocator = + GetAllocator(tf_allocator_adapter, device, stream, platform_info); + if (platform_info.xla_device_metadata()) { + options.shape_representation_fn = + platform_info.xla_device_metadata()->shape_representation_fn(); + } + // If reference variables are not present in the graph, we can safely alias + // passthrough parameters without performing a copy. + options.alias_passthrough_params = + !has_ref_vars && !platform_info.is_on_xla_device(); + return options; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_platform_info.h b/tensorflow/compiler/jit/xla_platform_info.h new file mode 100644 index 00000000000..bfb438cc398 --- /dev/null +++ b/tensorflow/compiler/jit/xla_platform_info.h @@ -0,0 +1,112 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ + +#include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/stream_executor/tf_allocator_adapter.h" + +namespace tensorflow { + +// Holds some information about the platform on which an +// XlaLaunch/_XlaCompile/_XlaRun op must run on. Provides a common layer of +// abstraction for normal and XLA devices. 
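GenerateCompilerOptions() centralizes what the JIT kernels previously assembled by hand: client and device type from the cache, function library and graph_def_version from the runtime, an allocator from GetAllocator() (stream-backed when a stream exists, host-platform otherwise), and shape_representation_fn only when running on an XLA device. A condensed caller-side sketch matching the call in xla_compile_on_demand_op.cc above:

// Assembling compiler options from the cache, runtime and platform info.
absl::optional<se::TfAllocatorAdapter> tf_allocator_adapter;
se::Stream* stream =
    ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
XlaCompiler::Options options = GenerateCompilerOptions(
    *cache, *ctx->function_library(), ctx->device(), stream, platform_info,
    /*has_ref_vars=*/true, &tf_allocator_adapter);
// options.device_allocator is stream-backed on GPU and host-backed on CPU;
// alias_passthrough_params is only enabled without ref vars and off XLA devices.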
+class XlaPlatformInfo { + public: + XlaPlatformInfo() : device_type_("") {} + XlaPlatformInfo(XlaPlatformInfo&&) = default; + explicit XlaPlatformInfo(const DeviceType device_type, + se::Platform::Id platform_id, + const XlaDevice::Metadata* xla_device_metadata, + se::DeviceMemoryAllocator* device_allocator) + : device_type_(device_type), + platform_id_(platform_id), + xla_device_metadata_(xla_device_metadata), + device_allocator_(device_allocator) {} + + XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; + + bool UseMultipleStreams() const { + return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); + } + + // Non-null only when run on an XLA device. + se::DeviceMemoryAllocator* custom_allocator() const { + return device_allocator_; + } + + DeviceType device_type() const { return device_type_; } + + // This is equal to xla_device_metadata()->platform()->id() if + // xla_device_metadata() is not nullptr. + se::Platform::Id platform_id() const { return platform_id_; } + + // This may be null if the op this XlaPlatformInfo is for was not placed on an + // XLA device. + const XlaDevice::Metadata* xla_device_metadata() const { + return xla_device_metadata_; + } + bool is_on_xla_device() const { return xla_device_metadata() != nullptr; } + + private: + DeviceType device_type_; + se::Platform::Id platform_id_; + + // xla_device_metadata_ lives in the tensorflow::DeviceBase in which the + // XlaLaunch/_XlaCompile/_XlaRun op is placed and thus does not die before the + // XlaLaunch/_XlaCompile/_XlaRun OpKernel. + const XlaDevice::Metadata* xla_device_metadata_; + + // If the op associated with this XlaPlatformInfo is placed on an XLA device + // then device_allocator_ is the xla::Backend's memory allocator. If the op + // is placed on a regular CPU or GPU device then device_allocator_ is null. + se::DeviceMemoryAllocator* device_allocator_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); +}; + +// Returns created XLA compilation cache. +Status BuildXlaCompilationCache(DeviceBase* dev, + const XlaPlatformInfo& platform_info, + XlaCompilationCache** cache); + +// Returns information about the platform from kernel context. +XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device); + +// Returns allocator from platform info if non-null, or populate and return a +// pointer to the allocator adapter with allocator from context. +// +// This is necessary because for XLA devices the underlying TF allocator returns +// dummy tensors. +// +// `stream` parameter is nullable when running on host. +se::DeviceMemoryAllocator* GetAllocator( + absl::optional* tf_allocator_adapter, + DeviceBase* device, se::Stream* stream, + const XlaPlatformInfo& platform_info); + +// Returns created options for the XLA compiler, and writes the used allocator +// into `tf_allocator_adapter`. 
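XlaPlatformInfo is move-only and intended to be computed once in an OpKernel constructor and stored as a const member, which is exactly how XlaCompileOnDemandOp now uses it. A minimal sketch of that pattern; "MyJitKernel" is a hypothetical kernel used only for illustration:

// Capturing platform information once, at kernel-construction time.
class MyJitKernel : public OpKernel {
 public:
  explicit MyJitKernel(OpKernelConstruction* ctx)
      : OpKernel(ctx),
        platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {}

  void Compute(OpKernelContext* ctx) override {
    // platform_info_ drives allocator and metadata decisions here.
  }

 private:
  const XlaPlatformInfo platform_info_;  // move-only; never copied
};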
+XlaCompiler::Options GenerateCompilerOptions( + const XlaCompilationCache& cache, + const FunctionLibraryRuntime& function_library, DeviceBase* device, + se::Stream* stream, const XlaPlatformInfo& platform_info, bool has_ref_vars, + absl::optional* tf_allocator_adapter); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_PLATFORM_INFO_H_ diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index dc358760534..2da1501819c 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -71,18 +71,6 @@ class XlaTensor { shaped_buffer_ = std::move(shaped_buffer); } - // Some tensors on the device may have known values on the host. We use these - // in on-demand mode to avoid re-copying values from the device if we know the - // host value already. - - // Return true if this XlaTensor contains a host tensor. - bool has_host_tensor() const { return host_tensor_.has_value(); } - // Return the contained host tensor. - // REQUIRES: has_host_tensor() - const Tensor& host_tensor() const { return *host_tensor_; } - // Sets the contained host tensor. - void set_host_tensor(const Tensor& tensor) { host_tensor_.emplace(tensor); } - // Adds synchronization events to 'stream' that wait for this tensor to be // defined on 'stream'. Does nothing if the tensor is already defined on that // stream. diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 01c187790b7..b1870b15595 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -24,11 +24,40 @@ filegroup( srcs = glob(["**/*.td"]), ) +cc_library( + name = "string_container_utils", + hdrs = ["utils/string_container_utils.h"], + deps = [ + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + ], +) + +cc_library( + name = "array_container_utils", + hdrs = ["utils/array_container_utils.h"], + deps = [ + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Support", + ], +) + +cc_library( + name = "name_utils", + srcs = ["utils/name_utils.cc"], + hdrs = ["utils/name_utils.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "op_or_arg_name_mapper", srcs = ["op_or_arg_name_mapper.cc"], hdrs = ["op_or_arg_name_mapper.h"], deps = [ + ":name_utils", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", @@ -40,14 +69,14 @@ cc_library( srcs = ["tf_mlir_opt_main.cc"], deps = [ ":init_mlir", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "//tensorflow/core:lib", - "//tensorflow/core/platform:logging", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:IR", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:MlirOptLib", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", + "@llvm-project//mlir:Shape", ], ) @@ -64,14 +93,13 @@ cc_library( # xla-legalize-tf-with-tf2xla pass. 
"//tensorflow/compiler/jit", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/lite:tensorflow_lite_legalize_tf", "//tensorflow/compiler/mlir/lite:tensorflow_lite_optimize", "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_to_quant", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_pass", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_test_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", @@ -127,11 +155,8 @@ tf_cc_binary( deps = [ ":passes", ":tf_mlir_opt_main", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", - "//tensorflow/compiler/mlir/tfjs:tensorflow_js_dialect_registration", "//tensorflow/compiler/mlir/xla:all_xla_passes_for_testing", ], ) @@ -141,10 +166,9 @@ tf_cc_binary( srcs = ["tf_mlir_translate_main.cc"], deps = [ ":init_mlir", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:tf_xla_mlir_translate", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow:translate_registration", @@ -157,7 +181,7 @@ tf_cc_binary( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", @@ -168,3 +192,5 @@ filegroup( name = "litfiles", srcs = glob(["runlit*py"]), ) + +exports_files(["run_lit.sh"]) diff --git a/tensorflow/compiler/mlir/glob_lit_test.bzl b/tensorflow/compiler/mlir/glob_lit_test.bzl index edbf3663a89..1fa57babdae 100644 --- a/tensorflow/compiler/mlir/glob_lit_test.bzl +++ b/tensorflow/compiler/mlir/glob_lit_test.bzl @@ -43,10 +43,10 @@ def _run_lit_test(name, data, size, tags, driver, features, exec_properties): and specifying a default driver will abort the tests. features: [str], list of extra features to enable. """ - if driver != _default_driver: - fail("There is no present support for custom drivers. Please omit" + - " the driver parameter when running this test. If you require" + - " custom driver support, please file an issue to request it.") + + # Remove the default_driver from the data: it does not exist as a file and is + # just a placeholder from the copybara rewrite. + data = [d for d in data if d != _default_driver] # Disable tests on windows for now, to enable testing rest of all xla and mlir. 
native.py_test( diff --git a/tensorflow/compiler/mlir/hlo/.gitignore b/tensorflow/compiler/mlir/hlo/.gitignore new file mode 100644 index 00000000000..cc1696bf575 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/.gitignore @@ -0,0 +1,4 @@ +build +llvm-project +llvm-build + diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD index 9eee39894e4..0e167519263 100644 --- a/tensorflow/compiler/mlir/hlo/BUILD +++ b/tensorflow/compiler/mlir/hlo/BUILD @@ -17,6 +17,7 @@ package_group( "//learning/brain/experimental/mlir/...", "//learning/brain/google/xla/kernels/...", "//learning/brain/google/xla/mlir/...", + "//learning/deepmind/partir/...", "//learning/pathways/data_parallel/tf2xla/...", "//platforms/xla/...", "//tensorflow/compiler/mlir/...", @@ -41,6 +42,7 @@ filegroup( "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Interfaces/CopyOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", @@ -298,6 +300,7 @@ cc_library( ":lhlo_ops_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:CopyOpInterface", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SideEffects", @@ -310,17 +313,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "hlo_dialect_force_registration", - srcs = ["lib/Dialect/mhlo/IR/dialect_registration.cc"], - deps = [ - ":hlo", - ":lhlo", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) - cc_library( name = "hlo_dialect_registration", srcs = ["lib/Dialect/mhlo/IR/init.cc"], @@ -341,6 +333,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", @@ -348,6 +341,22 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "mhlo_control_flow_to_scf", + srcs = ["lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc"], + hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], + deps = [ + ":hlo", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + cc_library( name = "map_lmhlo_to_scalar_op", hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h"], @@ -404,6 +413,7 @@ cc_library( cc_library( name = "lhlo_legalize_to_llvm", srcs = ["lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc"], + hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], deps = [ ":lhlo", "@llvm-project//mlir:IR", @@ -419,7 +429,10 @@ cc_library( cc_library( name = "legalize_to_linalg", srcs = ["lib/Dialect/mhlo/transforms/legalize_to_linalg.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", + "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", + ], deps = [ ":hlo", ":lhlo", @@ -438,9 +451,13 @@ cc_library( cc_library( name = "transform_unranked_hlo", srcs = ["lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], + hdrs = [ + 
"include/mlir-hlo/Dialect/mhlo/transforms/passes.h", + "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", + ], deps = [ ":hlo", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", @@ -458,6 +475,7 @@ cc_library( ":lhlo", ":map_lmhlo_to_scalar_op", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", @@ -476,9 +494,11 @@ cc_library( deps = [ ":lhlo", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -486,21 +506,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "lhlo_copy_removal", - srcs = ["lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], - deps = [ - ":lhlo", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - ], - alwayslink = 1, -) - cc_library( name = "hlo_legalize_to_lhlo", srcs = ["lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc"], @@ -681,7 +686,6 @@ cc_library( ], deps = [ ":hlo", - ":hlo_dialect_force_registration", ":lower_complex_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", @@ -732,6 +736,7 @@ cc_library( srcs = ["lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc"], hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], deps = [ + ":chlo_legalize_to_hlo_inc_gen", ":hlo", "@llvm-project//mlir:IR", "@llvm-project//mlir:SCFDialect", @@ -741,6 +746,25 @@ cc_library( ], ) +gentbl( + name = "chlo_legalize_to_hlo_inc_gen", + strip_include_prefix = "lib/Dialect/mhlo/transforms/", + tbl_outs = [ + ( + "-gen-rewriters", + "lib/Dialect/mhlo/transforms/generated_chlo_legalize_to_hlo.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + ], +) + cc_library( name = "test_passes", srcs = [ @@ -759,8 +783,6 @@ cc_library( ":lhlo_legalize_to_llvm", # build-cleaner: keep ":materialize_broadcasts", # build-cleaner: keep ":unfuse_batch_norm", # build-cleaner: keep - "@llvm-project//mlir:AffineToStandardTransforms", - "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LLVMDialect", @@ -793,11 +815,11 @@ cc_library( ":legalize_to_linalg", ":legalize_to_standard", ":lhlo", - ":lhlo_copy_removal", ":lhlo_fuse_linalg", ":lhlo_legalize_to_affine", ":lhlo_legalize_to_gpu", ":lhlo_legalize_to_parallel_loops", + ":mhlo_control_flow_to_scf", ":mhlo_fusion", ":mhlo_to_mhlo_lowering_patterns", ":sink_constants_to_control_flow", @@ -807,13 +829,6 @@ cc_library( ], ) -cc_library( - name = "register_all_passes", - srcs = ["lib/Dialect/mhlo/transforms/register_all_passes.cc"], - deps = [":all_passes"], - alwayslink = 1, -) - cc_binary( name = "mlir-hlo-opt", srcs = [ @@ -821,7 +836,8 @@ cc_binary( ], deps = [ ":all_passes", - ":hlo_dialect_registration", + ":hlo", + ":lhlo", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", diff --git 
a/tensorflow/compiler/mlir/hlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/CMakeLists.txt new file mode 100644 index 00000000000..c4e2ea123df --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/CMakeLists.txt @@ -0,0 +1,94 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +cmake_minimum_required(VERSION 3.13.4) + +if(POLICY CMP0068) + cmake_policy(SET CMP0068 NEW) + set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON) +endif() + +if(POLICY CMP0075) + cmake_policy(SET CMP0075 NEW) +endif() + +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() + +#------------------------------------------------------------------------------- +# Project setup and globals +#------------------------------------------------------------------------------- + +project(mlir-hlo LANGUAGES CXX C) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +#------------------------------------------------------------------------------- +# Options and settings +#------------------------------------------------------------------------------- + +#------------------------------------------------------------------------------- +# MSVC defaults +#------------------------------------------------------------------------------- + +if(MSVC) + add_compile_options( + $<$:/MD> + $<$:/MD> + $<$:/MD> + ) +endif() + +#------------------------------------------------------------------------------- +# MLIR/LLVM Configuration +#------------------------------------------------------------------------------- + +find_package(MLIR REQUIRED CONFIG) +message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") + +if(LLVM_ENABLE_ZLIB) + find_package(ZLIB) +endif() + +include(TableGen) +include(AddLLVM) +include(AddMLIR) +include(HandleLLVMOptions) +include_directories(${LLVM_INCLUDE_DIRS}) +include_directories(${MLIR_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_BINARY_DIR}/include) +include_directories(${PROJECT_BINARY_DIR}/) +link_directories(${LLVM_BUILD_LIBRARY_DIR}) +add_definitions(${LLVM_DEFINITIONS}) + +#------------------------------------------------------------------------------- +# Directory setup +#------------------------------------------------------------------------------- + +set(MLIR_HLO_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(MLIR_HLO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +add_custom_target(check-mlir-hlo) + +add_subdirectory(include/mlir-hlo) +add_subdirectory(lib) +add_subdirectory(tools) +add_subdirectory(tests) diff --git a/tensorflow/compiler/mlir/hlo/README.md b/tensorflow/compiler/mlir/hlo/README.md index 1be6fb29d13..9eaa14031fd 100644 --- a/tensorflow/compiler/mlir/hlo/README.md +++ b/tensorflow/compiler/mlir/hlo/README.md @@ -1,4 +1,4 @@ -# MLIR-HLO +# MLIR-HLO: A 
Standalone "HLO" MLIR-based Compiler The code here exists in two places: @@ -22,10 +22,43 @@ upstream. ## QuickStart: building and testing -TODO +These instructions work on Linux, you may have to adjust for your plaform. + +To build the code in this repository, you need a clone of the LLVM/MLIR git +repository: + + $ git clone https://github.com/llvm/llvm-project.git + + +You need to make sure you have the right commit checked out in the LLVM +repository (you need to do this every time you pull from this repo): + + $ (cd llvm-project && git checkout $(cat build_tools/llvm_version.txt)) + +We provide a script to configure and build LLVM/MLIR: + + $ build_tools/build_mlir.sh ${PWD}/llvm-project/ ${PWD}/llvm-build + +Again this is something to do every time you pull from this repository and the +LLVM revision changes. + +Finally you can build and test this repository: + + $ mkdir build && cd build + $ cmake .. -GNinja \ + -DLLVM_ENABLE_LLD=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=On \ + -DMLIR_DIR=${PWD}/../llvm-build/lib/cmake/mlir + $ ninja check-mlir-hlo + ## Overview +MLIR-HLO aims to provide an end-to-end compiler for CPU and GPU, as well as +building reusable blocks for other accelerators. This is heavily inspired by the +success of XLA. + [XLA](https://www.tensorflow.org/xla/) (Accelerated Linear Algebra) is a domain-specific compiler framework and execution environment for linear algebra, which powers code-generation for ML frameworks like TensorFlow, JAX, and others. diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt new file mode 100644 index 00000000000..92759d76383 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(Dialect) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt new file mode 100644 index 00000000000..5ee1a1924ec --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +add_subdirectory(mhlo) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt new file mode 100644 index 00000000000..e138afa587f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(IR) +add_subdirectory(transforms) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt new file mode 100644 index 00000000000..09bdca84cd3 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt @@ -0,0 +1,31 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Need a separate function because of the .cc vs .cpp used in the one provided by MLIR +function(add_mlir_hlo_dialect dialect dialect_namespace) + set(LLVM_TARGET_DEFINITIONS ${dialect}.td) + mlir_tablegen(${dialect}.h.inc -gen-op-decls) + mlir_tablegen(${dialect}.cc.inc -gen-op-defs) + mlir_tablegen(${dialect}_structs.h.inc -gen-struct-attr-decls) + mlir_tablegen(${dialect}_structs.cc.inc -gen-struct-attr-defs) + add_public_tablegen_target(MLIR${dialect}IncGen) + add_dependencies(mlir-headers MLIR${dialect}IncGen) +endfunction() + +add_mlir_hlo_dialect(chlo_ops chlo) +add_mlir_hlo_dialect(hlo_ops mhlo) +add_mlir_hlo_dialect(lhlo_ops lmhlo) + +add_mlir_interface(infer_fusibility_op_interface) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h index 14a22e92a74..05b22770401 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "mlir/IR/OpDefinition.h" #include "mlir/IR/Operation.h" #include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/IR/Types.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -32,14 +33,39 @@ namespace mlir { namespace chlo { class HloClientDialect : public Dialect { + void initialize(); + public: - explicit HloClientDialect(MLIRContext *context); + explicit HloClientDialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context, + TypeID::get()) { + initialize(); + } static StringRef getDialectNamespace() { return "chlo"; } }; +} // namespace chlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h.inc" +namespace mlir { +namespace chlo { + +template +static Value getConstantLike(OpBuilder& b, Location loc, T constant, + Value val) { + Type ty = getElementTypeOrSelf(val.getType()); + + auto getAttr = [&]() -> Attribute { + if (ty.isa()) return b.getIntegerAttr(ty, constant); + if (ty.isa()) return b.getFloatAttr(ty, constant); + llvm_unreachable("unhandled element type"); + }; + return b.create(loc, getAttr(), val); +} + } // namespace chlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td index d7cdd12d351..54b40fe0c94 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td @@ -37,7 +37,7 @@ include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" def HLOClient_Dialect : Dialect { let name = "chlo"; - let cppNamespace = "chlo"; + let cppNamespace = "::mlir::chlo"; let summary = [{ Client HLO Ops }]; @@ -344,14 +344,16 @@ def HLOClient_BroadcastComplexOp : HLOClient_BroadcastBinaryElementwiseOp< //===----------------------------------------------------------------------===// class HLOClient_UnaryElementwiseOp traits, - Type TensorType>: HLOClient_Op { + Type TensorType> : HLOClient_Op { let arguments = (ins TensorType:$operand); - let results = (outs TensorType); + let results = (outs TensorType:$result); + + let assemblyFormat = "$operand attr-dict `:` type($operand)"; } -def HLOClient_AcosOp: HLOClient_UnaryElementwiseOp<"acos", - [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor> { +def HLOClient_AcosOp : HLOClient_UnaryElementwiseOp<"acos", [], + HLO_FpOrComplexTensor> { let summary = "Acos operator"; let description = [{ @@ -364,6 +366,37 @@ def HLOClient_AcosOp: HLOClient_UnaryElementwiseOp<"acos", }]; } +def HLOClient_TanOp : HLOClient_UnaryElementwiseOp<"tan", [], + HLO_FpOrComplexTensor> { + let summary = "Tan operation"; + + let description = [{ + Returns `Tan(operand)` element-wise. + + $$ + \tan(x) = \sin(x) / \cos(x) + $$ + }]; +} + +def HLOClient_ConstantLikeOp : HLOClient_Op<"constant_like", + [NoSideEffect, SameOperandsAndResultShape, + InferTypeOpInterface, + DeclareOpInterfaceMethods, + NativeOpTrait<"InferTensorType">]> { + let summary = "Constant like operator"; + + let description = [{ + Returns a splat constant of the same shape as the operand. + }]; + + // TODO(jpienaar): value's type could be tightened. 
+ let arguments = (ins AnyAttr:$value, HLO_Tensor:$operand); + let results = (outs HLO_Tensor); + + let hasCanonicalizer = 1; +} + //===----------------------------------------------------------------------===// // Broadcasting compare op //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h index 0036cc0dc19..60ee4e613eb 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h @@ -19,7 +19,6 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ #include "llvm/ADT/StringRef.h" -#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/DialectImplementation.h" @@ -32,11 +31,14 @@ limitations under the License. #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +// clang-format off +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +// clang-format on + namespace mlir { class OpBuilder; -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc" - namespace mhlo { class MhloDialect : public Dialect { @@ -56,22 +58,9 @@ class MhloDialect : public Dialect { void printType(Type type, DialectAsmPrinter &os) const override; }; -namespace HLOTypes { -enum Kind { - Token = Type::FIRST_XLA_HLO_TYPE, -}; -} // namespace HLOTypes - class TokenType : public Type::TypeBase { public: using Base::Base; - - static TokenType get(MLIRContext *context) { - return Base::get(context, HLOTypes::Token); - } - - // Support method to enable LLVM-style type casting. 
- static bool kindof(unsigned kind) { return kind == HLOTypes::Token; } }; // Shape derivation function that computes the shape of the result based on @@ -90,10 +79,10 @@ LogicalResult deriveShapeFromFirstOperand( OpBuilder *builder, Operation *op, SmallVectorImpl *reifiedReturnShapes); -#define GET_OP_CLASSES -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" - } // end namespace mhlo } // end namespace mlir +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td index e83bf874c62..351e8bdae0e 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td @@ -27,7 +27,7 @@ include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" def HLO_Dialect : Dialect { let name = "mhlo"; - let cppNamespace = "mhlo"; + let cppNamespace = "::mlir::mhlo"; } class HLO_Op traits> : @@ -67,8 +67,7 @@ def HLO_ConstOp : HLO_Op<"constant", "OpBuilder &builder, OperationState &result, Attribute value" >]; - let printer = [{ return Print(*this, &p); }]; - let parser = [{ return ParseConstOp(&parser, &result); }]; + let assemblyFormat = "attr-dict $value"; let hasFolder = 1; @@ -225,11 +224,14 @@ def HLO_LogisticOp: HLO_UnaryElementwiseOp<"logistic", def HLO_NotOp: HLO_UnaryElementwiseOp<"not", [NoSideEffect, SameOperandsAndResultType], HLO_PredOrIntTensor>, - BASE_HLO_NotOp; + BASE_HLO_NotOp { +} def HLO_NegOp: HLO_UnaryElementwiseOp<"negate", [NoSideEffect, SameOperandsAndResultType], HLO_IntFpOrComplexTensor>, - BASE_HLO_NegOp; + BASE_HLO_NegOp { + let hasFolder = 1; +} def HLO_PopulationCountOp: HLO_UnaryElementwiseOp<"popcnt", [NoSideEffect, SameOperandsAndResultType], HLO_IntTensor>, @@ -263,7 +265,9 @@ def HLO_SinOp: HLO_UnaryElementwiseOp<"sine", def HLO_SqrtOp: HLO_UnaryElementwiseOp<"sqrt", [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, - BASE_HLO_SqrtOp; + BASE_HLO_SqrtOp { + let hasFolder = 1; +} def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", [NoSideEffect, SameOperandsAndResultType], @@ -380,6 +384,8 @@ class HLO_BinaryLogicalElementwiseOp : HLO_PredOrIntTensor:$lhs, HLO_PredOrIntTensor:$rhs ); + + let hasFolder = 1; } def HLO_AndOp: HLO_BinaryLogicalElementwiseOp<"and">, BASE_HLO_AndOp; @@ -492,9 +498,7 @@ def HLO_RecvOp : HLO_Op<"recv", []> { def HLO_ReplicaIdOp : HLO_Op<"replica_id", [NoSideEffect]>, BASE_HLO_ReplicaIdOp { - // TODO(prakalps): The output should unsigned 32-bit integer but mlir does - // not differentiate between signed and unsigned int. 
- let results = (outs I32Tensor); + let results = (outs TensorOf<[UI32]>); } //===----------------------------------------------------------------------===// @@ -671,11 +675,13 @@ def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { "OpBuilder &builder, OperationState &results, " "ValueRange values">]; + let hasCanonicalizer = 1; } -def HLO_CompareOp: HLO_Op<"compare", - [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]>, - BASE_HLO_CompareOp { +def HLO_CompareOp: HLO_Op<"compare", [NoSideEffect, SameTypeOperands, + SameOperandsAndResultShape, + DeclareOpInterfaceMethods]>, BASE_HLO_CompareOp { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, @@ -1067,6 +1073,8 @@ def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { ); let results = (outs HLO_Tensor); + + let hasCanonicalizer = 1; } def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, @@ -1079,6 +1087,8 @@ def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, // XLA semantics is available. This limitation is because of the current XLA // implementation. let results = (outs I32Tensor); + + let hasFolder = 1; } def HLO_MapOp: HLO_Op<"map", @@ -1143,7 +1153,10 @@ def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, } // TODO(jpienaar): Add broadcastable trait. -def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods]>, BASE_HLO_SelectOp { +def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + ]>, BASE_HLO_SelectOp { let arguments = (ins HLO_PredTensor:$pred, HLO_Tensor:$on_true, @@ -1151,6 +1164,8 @@ def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods { } //===----------------------------------------------------------------------===// -// MHLO RngUniform Operator. +// MHLO RNG Operators. //===----------------------------------------------------------------------===// + def HLO_RngUniformOp : HLO_Op<"rng_uniform", []>, BASE_HLO_RngUniformOp { let arguments = (ins HLO_PredIntOrFpTensor:$a, @@ -1355,6 +1371,19 @@ def HLO_RngNormalOp : HLO_Op<"rng_normal", []>, BASE_HLO_RngNormalOp { let hasCustomHLOConverter = 1; } +def HLO_RngBitGeneratorOp : HLO_Op<"rng_bit_generator", [NoSideEffect]>, BASE_HLO_RngBitGeneratorOp { + let arguments = (ins + // TODO(jpienaar): This could be an enum instead. + I32Attr:$rng_algorithm, + HLO_IntOrFpTensor:$initial_state + ); + + let results = (outs HLO_TensorOrTuple:$result); + + // TODO(jpienaar): This should not be needed. + let hasCustomHLOConverter = 1; +} + //===----------------------------------------------------------------------===// // MHLO Quantize Operator. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td index 7f9784d7f11..2f80545ad19 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td @@ -316,6 +316,19 @@ class BASE_HLO_RealOp { }]; } +class BASE_HLO_RngBitGeneratorOp { + string summary = "Uniform random number generator operator"; + + string description = [{ + Returns an output with a given shape filled with uniform random bits using + the specified algorithm (or backend default) and returns an updated state + (with the same shape as initial state) and the generated random data. 
+ + See + https://www.tensorflow.org/xla/operation_semantics#rngbitgenerator. + }]; +} + class BASE_HLO_RoundOp { string summary = "Round operator"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td index e1ae9e1fb89..32940cbc623 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td @@ -27,6 +27,9 @@ def CastIntElementsAttr : NativeCodeCall<"$0.cast()">; class ConstantSplat : NativeCodeCall< "hlo::getSplat(&$_builder, $0, " # value # ")">; +class HLO_ConstantLike : NativeCodeCall< + "chlo::getConstantLike($_builder, $_loc, " # value # ", $0)">; + def NullDenseIntElementsAttr : NativeCodeCall<"DenseIntElementsAttr()">; def BinBroadcastDimensions : NativeCodeCall< diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h index bb9b29096f3..cc24e17c001 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h @@ -27,14 +27,17 @@ limitations under the License. #include "mlir/IR/Operation.h" #include "mlir/IR/StandardTypes.h" #include "mlir/IR/Types.h" +#include "mlir/Interfaces/CopyOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" namespace mlir { class OpBuilder; +} // namespace mlir #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.h.inc" +namespace mlir { namespace lmhlo { class LmhloDialect : public Dialect { @@ -43,10 +46,10 @@ class LmhloDialect : public Dialect { static StringRef getDialectNamespace() { return "lmhlo"; } }; -#define GET_OP_CLASSES -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" - } // namespace lmhlo } // end namespace mlir +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td index 3fa46584ca2..9225d0289dd 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td @@ -34,13 +34,14 @@ limitations under the License. 
#define LHLO_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" def LHLO_Dialect : Dialect { let name = "lmhlo"; - let cppNamespace = "lmhlo"; + let cppNamespace = "::mlir::lmhlo"; } //===----------------------------------------------------------------------===// @@ -81,6 +82,8 @@ def LHLO_ConstOp : LHLO_Op<"constant", []>, BASE_HLO_ConstOp { ElementsAttr:$value, Arg:$output ); + + let hasCanonicalizer = 1; } def LHLO_IotaOp : LHLO_Op<"iota", []>, BASE_HLO_IotaOp { @@ -614,11 +617,16 @@ def LHLO_ConvOp : LHLO_Op<"convolution", []>, BASE_HLO_ConvOp { ); } -def LHLO_CopyOp: LHLO_Op<"copy", []>, BASE_HLO_CopyOp { +def LHLO_CopyOp: LHLO_Op<"copy", [CopyOpInterface]>, BASE_HLO_CopyOp { let arguments = (ins Arg:$operand, Arg:$output ); + + let extraClassDeclaration = [{ + Value getSource() { return operand();} + Value getTarget() { return output(); } + }]; } def LHLO_DotOp: LHLO_Op<"dot", []>, BASE_HLO_DotOp { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h index 5773901ad78..cb0af3a159d 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h @@ -17,10 +17,11 @@ limitations under the License. #define MLIR_HLO_DIALECT_MHLO_IR_REGISTER_H_ namespace mlir { +class DialectRegistry; namespace mhlo { -void registerAllDialects(); - +// Add chlo, mhlo, lmhlo dialects to the provided registry. +void registerAllMhloDialects(DialectRegistry ®istry); } } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt new file mode 100644 index 00000000000..6de6851b8d7 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(LLVM_TARGET_DEFINITIONS mhlo_passes.td) +mlir_tablegen(mhlo_passes.h.inc -gen-pass-decls -name MHLO) +add_public_tablegen_target(MLIRMhloPassIncGen) + +set(LLVM_TARGET_DEFINITIONS lmhlo_passes.td) +mlir_tablegen(lmhlo_passes.h.inc -gen-pass-decls -name LMHLO) +add_public_tablegen_target(MLIRLmhloPassIncGen) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td index 963ff5dbacf..39b4ca65043 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td @@ -15,12 +15,6 @@ limitations under the License. 
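Editor's aside, not part of the patch: with `CopyOpInterface` attached to `lmhlo.copy` above (through `getSource`/`getTarget`), generic analyses can treat it as an opaque copy. A small illustrative sketch:

    #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
    #include "mlir/Interfaces/CopyOpInterface.h"

    // Works for any op implementing CopyOpInterface, including lmhlo.copy
    // after this change.
    bool isSelfCopy(mlir::Operation *op) {
      if (auto copy = mlir::dyn_cast<mlir::CopyOpInterface>(op))
        return copy.getSource() == copy.getTarget();
      return false;
    }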
include "mlir/Pass/PassBase.td" -def LhloCopyRemovalPass : Pass<"lhlo-copy-removal", "FuncOp"> { - let summary = "Removes redundant LHLO copy operations."; - let constructor = "createLhloCopyRemovalPass()"; -} - - def LhloLegalizeToLinalgPass : Pass<"lhlo-legalize-to-linalg", "FuncOp"> { let summary = "Legalize from LHLO dialect to Linalg dialect."; let constructor = "createLegalizeLhloToLinalgPass()"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h index c51bcfcfe89..d2621759213 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h @@ -40,6 +40,7 @@ using HloToLhloOp = typename HloToLhloOpImpl::Type; MAP_HLO_TO_LHLO(AbsOp); MAP_HLO_TO_LHLO(AddOp); MAP_HLO_TO_LHLO(AndOp); +MAP_HLO_TO_LHLO(Atan2Op); MAP_HLO_TO_LHLO(BroadcastInDimOp); MAP_HLO_TO_LHLO(CeilOp); MAP_HLO_TO_LHLO(ConstOp); @@ -52,6 +53,7 @@ MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); +MAP_HLO_TO_LHLO(FloorOp); MAP_HLO_TO_LHLO(GatherOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); @@ -68,9 +70,11 @@ MAP_HLO_TO_LHLO(RsqrtOp); MAP_HLO_TO_LHLO(SelectOp); MAP_HLO_TO_LHLO(SignOp); MAP_HLO_TO_LHLO(SinOp); +MAP_HLO_TO_LHLO(SliceOp); MAP_HLO_TO_LHLO(SqrtOp); MAP_HLO_TO_LHLO(SubOp); MAP_HLO_TO_LHLO(TanhOp); +MAP_HLO_TO_LHLO(TransposeOp); #undef MAP_HLO_TO_LHLO diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h index 2bb5ab2888d..1199dae1ab2 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h @@ -336,6 +336,15 @@ inline Value MapLhloOpToStdScalarOp(Location loc, loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp(Location loc, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + /// Implements the conversion of HLO op to scalar op (to use within region of a /// linalg.generic op) for compare-select style operations like min/max. 
template diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td index fa3bde24df1..aa0f4c317d4 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td @@ -30,6 +30,11 @@ def LegalizeControlFlowPass : Pass<"mhlo-legalize-control-flow", "FuncOp"> { let constructor = "createLegalizeControlFlowPass()"; } +def LegalizeControlFlowToScfPass : Pass<"mhlo-control-flow-to-scf", "FuncOp"> { + let summary = "Legalize from MHLO control flow to SCF control flow."; + let constructor = "createControlFlowToScfPass()"; +} + def LegalizeGatherToTorchIndexSelectPass : Pass<"mhlo-legalize-gather-to-torch-index-select", "FuncOp"> { let summary = "Legalizes gathers to a torch index select."; let constructor = "createLegalizeGatherToTorchIndexSelectPass()"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h index efa116f3f0d..fae79d91b1b 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h @@ -30,11 +30,17 @@ template class OperationPass; class Pass; +// Transforms unranked HLO operations to ranked ones where possible. +std::unique_ptr createTransformUnrankedHloPass(); + namespace mhlo { /// Lowers HLO control flow ops to the Standard dialect. std::unique_ptr> createLegalizeControlFlowPass(); +/// Lowers MHLO control flow ops to the SCF dialect. +std::unique_ptr> createControlFlowToScfPass(); + /// Lowers from HLO dialect to Standard dialect. std::unique_ptr> createLegalizeToStdPass(); @@ -49,9 +55,6 @@ std::unique_ptr> createLegalizeToLhloPass( // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); -// Transforms unranked HLO operations to ranked ones where possible. -std::unique_ptr> createTransformUnrankedHloPass(); - // Sinks constants implicitly captured in control flow regions. This is // necessary to export to XLA. std::unique_ptr> createSinkConstantsToControlFlowPass(); @@ -92,12 +95,6 @@ std::unique_ptr createLegalizeToGpuPass(); std::unique_ptr createLhloFuseLinalgPass( bool use_parallel_loops = false, llvm::ArrayRef tile_sizes = {}); -// Removes unnecessary LHLO copies which copy from the allocated buffers to the -// block arguments. The block arguments are used instead of all uses of these -// buffers. The buffers are freed. This pass only works in regions that contain -// a single block. -std::unique_ptr createLhloCopyRemovalPass(); - // Lowers from LHLO dialect to parallel loops. std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h index 725155e9403..cf21a95db6f 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h @@ -20,6 +20,7 @@ limitations under the License. 
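Editor's aside, not part of the patch: a sketch of scheduling the new MHLO-to-SCF lowering via the `createControlFlowToScfPass` factory declared in passes.h above; the header paths follow this snapshot of MLIR and the wrapper name is illustrative.

    #include "mlir-hlo/Dialect/mhlo/transforms/passes.h"
    #include "mlir/IR/Function.h"
    #include "mlir/Pass/PassManager.h"

    // The pass is registered on FuncOp (see mhlo_passes.td), so it is nested
    // under the module-level pass manager.
    void addMhloControlFlowToScf(mlir::PassManager &pm) {
      pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createControlFlowToScfPass());
    }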
#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/BufferPlacement.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { @@ -27,6 +28,12 @@ class LLVMTypeConverter; class LowerToLLVMOptions; class OwningRewritePatternList; class BufferAssignmentPlacer; + +// Populates a collection of rewrite patterns to realize element-wise operations +// on ranked tensors where possible. +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + namespace mhlo { // Collection of rewrite patterns for lowering a general dot product. @@ -50,8 +57,9 @@ void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, // Collection of rewrite patterns for lowering of HLO to LHLO dialect. void populateHLOToLHLOConversionPattern( - MLIRContext *context, BufferAssignmentPlacer *bufferAssignment, - TypeConverter *converter, OwningRewritePatternList *patterns); + MLIRContext *context, BufferAssignmentTypeConverter *converter, + OwningRewritePatternList *patterns); + // Collection of rewrite patterns for lowering of HLO to Linalg dialect. void populateHLOToLinalgConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h index 1e2404299b2..1c57073f4ab 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h @@ -38,10 +38,12 @@ bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, // Emits shape dialect ops to compute the result shape for a broadcasting // binary elementwise op which broadcasts according to "numpy" semantics -// (see above), returning an extents tensor of the resulting shape. -Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, - Value rhs, - OpBuilder& builder); +// (see above), returning a `shape.shape` or an extent tensor of the resulting +// shape. The result should only be an extent tensor in contexts that ensure +// both operands to be broadcastable. +Value ComputeBinaryElementwiseBroadcastingResultExtents( + Location loc, Value lhs, Value rhs, OpBuilder& builder, + bool unsafe_as_extent_tensor); } // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h index 1e335ae6b82..74ea9c9b1a7 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h @@ -65,9 +65,24 @@ static ElementsAttr getSplat(Builder* b, Value val, T constant) { // Returns DenseElementsAttr of rank zero with the given element type and the // value. -// Requires `ty` to be either FloatType of IntegerType. +// Requires `ty` to be either FloatType, IntegerType, or ComplexType. DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value); +// Enum type used to specify scalar argument to GetScalarLimitOfType. +enum ScalarLimit { + kLowest, // The scalar corresponding to numeric_limits::lowest. + kInfinityLowest, // Like kMax, but returns -infinity where available. + kMax, // The scalar corresponding to numeric_limits::max. + kInfinityMax, // Like kMax, but returns infinity where available. +}; + +// Returns a scalar limit value for the given type. +// +// The argument 'limit' describes which scalar value to return. 
+// +// Requires `ty` to be either FloatType or IntegerType. +DenseElementsAttr GetScalarLimitOfType(Type ty, ScalarLimit limit); + } // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt new file mode 100644 index 00000000000..ec65a5ee882 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(Dialect) +add_subdirectory(utils) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt new file mode 100644 index 00000000000..5ee1a1924ec --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(mhlo) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt new file mode 100644 index 00000000000..e138afa587f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(IR) +add_subdirectory(transforms) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt new file mode 100644 index 00000000000..d7bb5057b00 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt @@ -0,0 +1,82 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
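Editor's aside, not part of the patch: a usage sketch for `GetScalarLimitOfType` declared in hlo_utils.h above, for example to build the identity value of a max-reduction; the wrapper name below is illustrative.

    #include "mlir-hlo/utils/hlo_utils.h"

    // Rank-zero splat holding numeric_limits<T>::lowest() for the given
    // float or integer element type.
    mlir::DenseElementsAttr maxReduceInit(mlir::Type element_type) {
      return mlir::hlo::GetScalarLimitOfType(element_type, mlir::hlo::kLowest);
    }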
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +include_directories(BEFORE + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + +set(LLVM_TARGET_DEFINITIONS hlo_patterns.td) +mlir_tablegen(hlo_patterns.cc.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloRewriterIncGen) + +set(LLVM_TARGET_DEFINITIONS mhlo_canonicalize.td) +mlir_tablegen(mhlo_canonicalize.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloCanonicalizeIncGen) + +add_mlir_dialect_library(ChloDialect + chlo_ops.cc + + DEPENDS + MLIRchlo_opsIncGen +) +target_link_libraries(ChloDialect PUBLIC MLIRIR) + +add_mlir_library(MhloInferFusibilityOpInterface + infer_fusibility_op_interface.cc + + DEPENDS + MLIRinfer_fusibility_op_interfaceIncGen +) + + +add_mlir_dialect_library(MhloDialect + hlo_ops.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRMhloCanonicalizeIncGen + MLIRMhloRewriterIncGen + MLIRinfer_fusibility_op_interfaceIncGen +) +target_link_libraries(MhloDialect + PUBLIC + MLIRIR + MhloInferFusibilityOpInterface + MLIRMhloUtils +) + + +add_mlir_dialect_library(LmhloDialect + lhlo_ops.cc + + DEPENDS + MLIRlhlo_opsIncGen +) +target_link_libraries(LmhloDialect PUBLIC MLIRIR) + + +add_mlir_dialect_library(MhloRegisterDialects + init.cc +DEPENDS + MLIRchlo_opsIncGen + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen +) +target_link_libraries(MhloRegisterDialects + PUBLIC + ChloDialect + MhloDialect + LmhloDialect +) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_canonicalize.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_canonicalize.td new file mode 100644 index 00000000000..eb92d9e0e46 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_canonicalize.td @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the canonicalize pattern definition file. + +include "mlir/IR/OpBase.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_utils.td" + +def UnaryToBinaryEinsumEq : NativeCodeCall< + "$_builder.getStringAttr(\",\" + $0.getValue().str())">; + +// Convert UnaryEinsumOp to EinsumOp with two operands with redundant first +// operand. 
+def UnaryEinsumToEinsum : Pat< + (HLO_UnaryEinsumOp $operand, $equation), + (HLO_EinsumOp (HLO_ConstOp (GetScalarOfType<1> $operand)), + $operand, (UnaryToBinaryEinsumEq $equation))>; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc index 99ed8bcb849..99b22a75a14 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc @@ -15,10 +15,12 @@ limitations under the License. #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/utils/broadcast_utils.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Diagnostics.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "mlir/IR/TypeUtilities.h" @@ -151,7 +153,7 @@ LogicalResult ReifyBroadcastBinaryOpReturnTypeShapes( } Value computed_shape = hlo::ComputeBinaryElementwiseBroadcastingResultExtents( - loc, lhs, rhs, builder); + loc, lhs, rhs, builder, /*unsafe_as_extent_tensor=*/false); if (!computed_shape) return failure(); reifiedReturnShapes.push_back(computed_shape); return success(); @@ -259,15 +261,62 @@ BROADCAST_BINARY_OP_DEFS(BroadcastXorOp); #undef BROADCAST_INFER_SHAPE_TYPE_OP_DEFS #undef BROADCAST_BINARY_OP_DEFS +static LogicalResult Verify(ConstantLikeOp op) { + if (op.value().getType() != op.getType().cast().getElementType()) + return op.emitOpError() << "value's type doesn't match element return type"; + return success(); +} + +LogicalResult ConstantLikeOp::inferReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + ConstantLikeOp::Adaptor op(operands, attributes); + if (failed(op.verify(location.getValue()))) return failure(); + Type element_type = op.value().getType(); + Type operand_type = op.operand().getType(); + if (operand_type.isa()) { + inferedReturnShapes.emplace_back(element_type); + } else { + const auto& shape = operand_type.cast().getShape(); + inferedReturnShapes.emplace_back(shape, element_type); + } + return success(); +} + +struct ConstantLikeToConstant : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ConstantLikeOp op, + PatternRewriter& rewriter) const override { + auto op_type = op.operand().getType().cast(); + if (!op_type.hasStaticShape()) return failure(); + auto type = RankedTensorType::get(op_type.getShape(), op.value().getType()); + ElementsAttr attr = DenseElementsAttr::get(type, op.value()); + rewriter.replaceOpWithNewOp(op.getOperation(), attr); + return success(); + } +}; + +void ConstantLikeOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + +} // namespace chlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" +namespace mlir { +namespace chlo { + //===----------------------------------------------------------------------===// // chlo Dialect Constructor //===----------------------------------------------------------------------===// -HloClientDialect::HloClientDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { +void HloClientDialect::initialize() { addOperations< #define GET_OP_LIST #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" diff --git 
a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc index 69b01009a0d..6711a916896 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc @@ -60,7 +60,11 @@ limitations under the License. namespace mlir { #include "hlo_patterns.cc.inc" +} // namespace mlir + #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc" + +namespace mlir { namespace mhlo { Operation* MhloDialect::materializeConstant(OpBuilder& builder, Attribute value, @@ -112,37 +116,6 @@ DenseIntElementsAttr BuildSliceLimits(DenseIntElementsAttr start_indices, // ConstOp //===----------------------------------------------------------------------===// -static void Print(ConstOp op, OpAsmPrinter* printer) { - // Print op name. - *printer << op.getOperationName(); - - // Elide attribute value while printing the attribute dictionary. - SmallVector elided_attrs; - elided_attrs.push_back("value"); - printer->printOptionalAttrDict(op.getAttrs(), elided_attrs); - - *printer << ' ' << op.value(); -} - -static ParseResult ParseConstOp(OpAsmParser* parser, OperationState* result) { - if (parser->parseOptionalAttrDict(result->attributes)) return failure(); - - // If colon is not present after attribute dictionary, it should be short form - // and attribute 'value' is outside the dictionary. - if (failed(parser->parseOptionalColon())) { - Attribute value; - if (parser->parseAttribute(value, "value", result->attributes)) - return failure(); - return parser->addTypeToList(value.getType(), result->types); - } - - // Long form should have type of the result after colon. - Type ty; - if (parser->parseType(ty)) return failure(); - result->types.push_back(ty); - return success(); -} - OpFoldResult ConstOp::fold(ArrayRef operands) { assert(operands.empty() && "constant has no operands"); @@ -196,6 +169,71 @@ static LogicalResult Verify(DotGeneralOp op) { return success(); } +//===----------------------------------------------------------------------===// +// GatherOp +//===----------------------------------------------------------------------===// + +// Converts gather ops to slice ops in case we have a single set of constant +// indices. +struct GatherSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GatherOp gather, + PatternRewriter& rewriter) const override { + DenseIntElementsAttr index; + if (!matchPattern(gather.start_indices(), m_Constant(&index))) + return failure(); + + const auto& dnums = gather.dimension_numbers(); + if (dnums.collapsed_slice_dims().getNumElements() != 0 || + dnums.index_vector_dim().getInt() != 0 || index.getType().getRank() > 1) + return failure(); + + // TODO(tberghammer): Remove when the verifier catches this case what is + // invalid if all previous condition holds. 
+ if (index.getNumElements() != dnums.start_index_map().getNumElements()) + return failure(); + + auto slice_end = + llvm::to_vector<8>(gather.slice_sizes().getValues()); + llvm::SmallVector slice_start(slice_end.size(), 0); + for (auto it : llvm::zip(dnums.start_index_map().getIntValues(), + index.getIntValues())) { + int64_t map_index = std::get<0>(it).getSExtValue(); + int64_t offset = std::get<1>(it).getSExtValue(); + slice_start[map_index] += offset; + slice_end[map_index] += offset; + } + + llvm::SmallVector slice_stride(slice_end.size(), 1); + rewriter.replaceOpWithNewOp( + gather, gather.getType(), gather.getOperand(0), + GetI64ElementsAttr(slice_start, &rewriter), + GetI64ElementsAttr(slice_end, &rewriter), + GetI64ElementsAttr(slice_stride, &rewriter)); + return success(); + } +}; + +void GatherOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// GetDimensionSizeOp +//===----------------------------------------------------------------------===// + +/// Fold get_dimension_size when the said shape dimension is a constant. +OpFoldResult GetDimensionSizeOp::fold(ArrayRef attrs) { + RankedTensorType type = operand().getType().cast(); + int32_t dim = dimension(); + if (type.isDynamic(dim)) return {}; + // The result type is always is a 0-d i32 tensor. + return DenseIntElementsAttr::get( + getResult().getType().cast(), type.getDimSize(dim)); +} + //===----------------------------------------------------------------------===// // IotaOp //===----------------------------------------------------------------------===// @@ -207,7 +245,7 @@ static LogicalResult Verify(IotaOp op) { if (shape.getRank() == 0) return op.emitOpError() << "does not support scalars."; - auto iota_dimension = op.iota_dimension().getSExtValue(); + auto iota_dimension = op.iota_dimension(); if (iota_dimension >= shape.getRank() || iota_dimension < 0) return op.emitOpError() << "iota dimension cannot go beyond the output " "rank or be negative."; @@ -229,8 +267,7 @@ struct IotaBroadcast : public OpRewritePattern { auto iota_dimension = iota.iota_dimension(); auto iota_type = RankedTensorType::get( - {result_ty.getDimSize(iota_dimension.getLimitedValue())}, - result_ty.getElementType()); + {result_ty.getDimSize(iota_dimension)}, result_ty.getElementType()); auto new_iota = rewriter.create(iota.getLoc(), iota_type, rewriter.getI64IntegerAttr(0)); @@ -250,7 +287,7 @@ void IotaOp::getCanonicalizationPatterns(OwningRewritePatternList& results, } OpFoldResult IotaOp::fold(ArrayRef operands) { - auto dimension = iota_dimension().getLimitedValue(); + auto dimension = iota_dimension(); auto result_ty = getResult().getType().cast(); if (result_ty.hasRank() && result_ty.getDimSize(dimension) == 1) { Builder builder(getContext()); @@ -294,7 +331,7 @@ struct DynamicIotaBroadcast : public OpRewritePattern { } auto iota_dimension = iota.iota_dimension(); - auto iota_dimension_int = iota_dimension.getLimitedValue(); + auto iota_dimension_int = iota_dimension; auto converted_shape = rewriter.create( iota.getLoc(), @@ -340,6 +377,33 @@ void DynamicIotaOp::getCanonicalizationPatterns( results.insert(context); } +//===----------------------------------------------------------------------===// +// DynamicUpdateSliceOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(DynamicUpdateSliceOp op) { + OperandRange indices = 
op.start_indices(); + if (indices.size() <= 1) return success(); + + // Note: start_indices is constrained to Variadic, so it + // is OK to cast indices to ShapedType here. + auto idx_tensor = indices.take_front().front().getType().cast(); + Type first_elem_ty = idx_tensor.getElementType(); + Type elem_ty; + + for (auto idx : llvm::drop_begin(indices, 1)) { + idx_tensor = idx.getType().cast(); + elem_ty = idx_tensor.getElementType(); + + if (first_elem_ty != elem_ty) { + return op.emitOpError() << "start indices must have same element type " + "(encountered mismatch: " + << first_elem_ty << " vs " << elem_ty << ")"; + } + } + return success(); +} + //===----------------------------------------------------------------------===// // AbsOp //===----------------------------------------------------------------------===// @@ -466,7 +530,7 @@ static LogicalResult Verify(DequantizeOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(GetTupleElementOp op) { - auto indexVal = op.index().getZExtValue(); + auto indexVal = op.index(); auto operandType = op.getOperand().getType().cast(); if (indexVal >= operandType.size()) { return op.emitOpError( @@ -485,7 +549,7 @@ static LogicalResult Verify(GetTupleElementOp op) { OpFoldResult GetTupleElementOp::fold(ArrayRef operands) { if (auto tupleOp = dyn_cast_or_null(getOperand().getDefiningOp())) { - return tupleOp.getOperand(index().getLimitedValue()); + return tupleOp.getOperand(index()); } return {}; @@ -506,6 +570,46 @@ static LogicalResult Verify(TupleOp op) { return success(); } +namespace { + +// Pattern for unpacking and repacking the same tuple. +struct UnpackRepackSameTuple : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TupleOp op, + PatternRewriter& rewriter) const override { + if (op.val().empty()) return failure(); + + Value first_element = op.val().front(); + auto first_element_op = + dyn_cast_or_null(first_element.getDefiningOp()); + if (!first_element_op || first_element_op.indexAttr().getInt() != 0) + return failure(); + + Value tuple_predecessor = first_element_op.getOperand(); + if (tuple_predecessor.getType() != op.getType()) return failure(); + + for (auto element_and_idx : llvm::enumerate(op.val().drop_front(1))) { + auto element_op = dyn_cast_or_null( + element_and_idx.value().getDefiningOp()); + if (!element_op || + element_op.indexAttr().getInt() != element_and_idx.index() + 1 || + element_op.getOperand() != tuple_predecessor) + return failure(); + } + + rewriter.replaceOp(op, tuple_predecessor); + return success(); + } +}; + +} // namespace + +void TupleOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // AllToAllOp //===----------------------------------------------------------------------===// @@ -515,8 +619,8 @@ static LogicalResult Verify(AllToAllOp op) { // count. 
auto type = op.getOperand().getType().dyn_cast(); if (!type) return success(); - auto split_dim_size = type.getDimSize(op.split_dimension().getSExtValue()); - auto split_count = op.split_count().getSExtValue(); + auto split_dim_size = type.getDimSize(op.split_dimension()); + auto split_count = op.split_count(); if (split_dim_size % split_count != 0) { return op.emitError() << "split dimension has size " << split_dim_size << ", expected to be a multiple of split_count " @@ -708,10 +812,12 @@ static LogicalResult Verify(DynamicBroadcastInDimOp op) { auto dimSize = operandType.getDimSize(i); auto resultDimSize = resultType.getDimSize(dimIndex); - if (dimSize != 1 && dimSize != resultDimSize) { + // Note: verifyCompatibleShapes doesn't consider size-1 broadcasting, so we + // add a manual check for this. + if (dimSize != 1 && failed(verifyCompatibleShape(dimSize, resultDimSize))) { return op.emitOpError( - llvm::formatv("size of operand dimension {0} ({1}) is not equal to " - "1 or size of result dimension {2} ({3})", + llvm::formatv("size of operand dimension {0} ({1}) is not compatible " + "with size of result dimension {2} ({3})", i, dimSize, dimIndex, resultDimSize)); } } @@ -862,7 +968,7 @@ class ConcatenateOperandRemoval : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ConcatenateOp op, PatternRewriter& rewriter) const override { - auto axis = op.dimension().getLimitedValue(); + auto axis = op.dimension(); llvm::SmallVector new_operands; for (auto operand : op.getOperands()) { auto ty = operand.getType().cast(); @@ -903,13 +1009,38 @@ LogicalResult ConcatenateOp::inferReturnTypes( } } - // If an input is unranked the output shape is unranked. + // Find the first ranked input to determine the output rank. + for (auto type : operands.getTypes()) { + auto shaped_type = type.cast(); + if (shaped_type.hasRank()) { + first_type = shaped_type; + break; + } + } + + // If all inputs are unranked, the result must be unranked. if (!first_type.hasRank()) { inferredReturnTypes.push_back(UnrankedTensorType::get(out_element)); return success(); } auto out_shape = llvm::to_vector<6>(first_type.getShape()); + + // Determine what the non-concatenate dimensions should be. + for (auto type : operands.getTypes()) { + auto shaped_ty = type.cast(); + if (!shaped_ty.hasRank()) { + continue; + } + + for (auto it : llvm::enumerate(shaped_ty.getShape())) { + // If a dimension is not dynamic, the output shape should match. 
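The extended ConcatenateOp::inferReturnTypes logic continues below: dynamic dimensions of the first ranked operand are filled in from any operand that knows them, and the concatenation dimension is then summed up. A standalone sketch of that merge with hypothetical shapes, using -1 as a stand-in for a dynamic size (the behaviour when an operand's concat dimension is dynamic is an assumption here):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamicSize

int main() {
  // Hypothetical operand shapes for a concatenate along dimension 0.
  std::vector<std::vector<int64_t>> shapes = {
      {5, kDynamic}, {3, 4}, {2, kDynamic}};
  int64_t dim = 0;

  // Start from the first ranked operand, then fill dynamic dimensions from
  // operands that do know the size.
  std::vector<int64_t> out = shapes.front();
  for (const auto& s : shapes)
    for (size_t i = 0; i < out.size(); ++i)
      if (out[i] == kDynamic) out[i] = s[i];

  // The concatenation dimension is the sum of the operand sizes; assume one
  // dynamic operand makes it dynamic.
  out[dim] = 0;
  for (const auto& s : shapes) {
    if (s[dim] == kDynamic) { out[dim] = kDynamic; break; }
    out[dim] += s[dim];
  }

  for (int64_t d : out) std::cout << d << " ";  // prints: 10 4
  std::cout << "\n";
}
```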
+ if (ShapedType::isDynamic(out_shape[it.index()])) { + out_shape[it.index()] = it.value(); + } + } + } + out_shape[dimension] = 0; for (auto operand : operands.getTypes()) { @@ -942,7 +1073,7 @@ void ConcatenateOp::getCanonicalizationPatterns( template static Attribute foldConcatenateHelper(ConcatenateOp* op, ArrayRef operands) { - auto axis = op->dimension().getLimitedValue(); + auto axis = op->dimension(); auto type = op->getType().cast(); SmallVector values; @@ -990,7 +1121,7 @@ OpFoldResult ConcatenateOp::fold(ArrayRef operands) { ShapedType type = getResult().getType().cast(); if (!type.hasStaticShape()) return {}; - auto axis = dimension().getLimitedValue(); + auto axis = dimension(); if (auto attr = foldConcatenate(this, operands)) { return attr; } @@ -1165,6 +1296,131 @@ static LogicalResult Verify(InfeedOp op) { return success(); } +//===----------------------------------------------------------------------===// +// Logical Ops +//===----------------------------------------------------------------------===// + +OpFoldResult AndOp::fold(ArrayRef operands) { + if (lhs() == rhs()) return lhs(); + + auto rType = getType().cast(); + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return rhs(); + } + + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhsVal; + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return lhs(); + } + + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhsVal; + } + } + + if (!rhsVal || !lhsVal) return {}; + + llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) & std::get<1>(it)); + } + + return DenseIntElementsAttr::get(rType, values); +} + +OpFoldResult OrOp::fold(ArrayRef operands) { + if (lhs() == rhs()) return lhs(); + + auto rType = getType().cast(); + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return lhsVal; + } + + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhs(); + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return rhsVal; + } + + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhs(); + } + } + + if (!rhsVal || !lhsVal) return {}; + + llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) | std::get<1>(it)); + } + + return DenseIntElementsAttr::get(rType, values); +} + +OpFoldResult XorOp::fold(ArrayRef operands) { + auto rType = getType().cast(); + if (lhs() == rhs()) { + Builder builder(getContext()); + return builder.getZeroAttr(rType); + } + + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhs(); + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhs(); + } + } + + if (!rhsVal || !lhsVal) return {}; + + 
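The And/Or/Xor folders in this hunk first apply the usual algebraic identities when one side is a splat constant, and only fall back to the element-wise loop below when both sides are constants. A standalone check of those identities in plain C++ (splat constants modelled as single integers, not the MLIR attributes):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// and: x & allOnes = x, x & 0 = 0, x & x = x
// or : x | allOnes = allOnes, x | 0 = x, x | x = x
// xor: x ^ 0 = x, x ^ x = 0
int main() {
  const uint8_t kAllOnes = 0xFF, kZero = 0x00, x = 0b10110010;
  std::cout << std::boolalpha
            << ((x & kAllOnes) == x) << " " << ((x & kZero) == kZero) << " "
            << ((x | kAllOnes) == kAllOnes) << " " << ((x | kZero) == x) << " "
            << ((x ^ kZero) == x) << " " << ((x ^ x) == kZero) << "\n";

  // When both operands are constants, the fold is a plain element-wise loop.
  std::vector<uint8_t> lhs = {0x0F, 0xF0}, rhs = {0x3C, 0x3C}, out;
  for (size_t i = 0; i < lhs.size(); ++i) out.push_back(lhs[i] ^ rhs[i]);
  for (unsigned v : out) std::cout << std::hex << v << " ";  // 33 cc
  std::cout << "\n";
}
```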
llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) ^ std::get<1>(it)); + } + + return DenseIntElementsAttr::get(rType, values); +} + //===----------------------------------------------------------------------===// // MapOp //===----------------------------------------------------------------------===// @@ -1358,6 +1614,29 @@ static LogicalResult Verify(SelectOp op) { return success(); } +OpFoldResult SelectOp::fold(ArrayRef operands) { + if (on_true() == on_false()) { + return on_true(); + } + + auto predicate = operands[0].dyn_cast_or_null(); + if (!predicate) { + return {}; + } + + auto predicateTy = predicate.getType().cast(); + if (!predicateTy.getElementType().isInteger(1)) { + return {}; + } + + if (predicate.isSplat()) { + return predicate.getSplatValue().getBoolValue() ? on_true() + : on_false(); + } + + return {}; +} + // Makes it such that a SelectOp that is a non-root operation in a DRR infers // the return type based on operand type. LogicalResult SelectOp::inferReturnTypes( @@ -1399,6 +1678,20 @@ LogicalResult SelectOp::inferReturnTypes( return success(); } +LogicalResult SelectOp::inferReturnTypeComponents( + mlir::MLIRContext*, llvm::Optional, mlir::ValueRange, + mlir::DictionaryAttr, mlir::RegionRange, + llvm::SmallVectorImpl&) { + // TODO(b/168772852) + return failure(); +} + +LogicalResult SelectOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); +} + //===----------------------------------------------------------------------===// // PadOp //===----------------------------------------------------------------------===// @@ -1546,6 +1839,79 @@ static LogicalResult Verify(CaseOp op) { return success(); } +//===----------------------------------------------------------------------===// +// SqrtOp +//===----------------------------------------------------------------------===// + +OpFoldResult SqrtOp::fold(ArrayRef operands) { + auto val = operands[0].dyn_cast_or_null(); + if (!val) return {}; + + auto type = getElementTypeOrSelf(getType()); + if (!type.isF32() && !type.isF64()) return {}; + + auto shaped_type = getType().cast(); + if (!shaped_type.hasStaticShape()) return {}; + + int bit_width = type.getIntOrFloatBitWidth(); + llvm::SmallVector values; + values.reserve(val.getNumElements()); + for (auto it : val.getFloatValues()) { + double value = bit_width == 32 ? it.convertToFloat() : it.convertToDouble(); + if (value < 0) return {}; + value = std::sqrt(value); + if (bit_width == 32) + values.emplace_back(static_cast(value)); + else + values.emplace_back(value); + } + return DenseFPElementsAttr::get(shaped_type, values); +} + +//===----------------------------------------------------------------------===// +// UnaryOps +//===----------------------------------------------------------------------===// + +template +static Attribute UnaryFolder(Op* op, ArrayRef attrs) { + if (!attrs[0]) return {}; + + DenseElementsAttr val = attrs[0].dyn_cast(); + if (!val) return {}; + + ShapedType type = op->getType().template cast(); + if (!type.hasStaticShape()) { + return {}; + } + + Type etype = type.getElementType(); + + // Evaluate for integer values. 
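The UnaryFolder template here is parameterized over an element type and a conversion functor and, after the element-type checks below, simply maps that functor over the constant's values (NegOp uses std::negate via the UNARY_FOLDER macro). A small standalone analogue of that scheme:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Standalone analogue of the UnaryFolder/UNARY_FOLDER pattern: map a functor
// across a constant's values to produce the folded constant.
template <typename T, typename Convert>
std::vector<T> UnaryFold(const std::vector<T>& values) {
  std::vector<T> result;
  result.reserve(values.size());
  for (const T& v : values) result.push_back(Convert()(v));
  return result;
}

int main() {
  std::vector<int64_t> attr = {1, -2, 3};
  for (int64_t v : UnaryFold<int64_t, std::negate<int64_t>>(attr))
    std::cout << v << " ";  // prints: -1 2 -3
  std::cout << "\n";
}
```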
+ if (!etype.isa()) { + return {}; + } + + SmallVector values; + values.reserve(val.getNumElements()); + for (const auto v : val.getValues()) { + values.push_back(Convert()(v)); + } + + return DenseElementsAttr::get(type, values); +} + +#define UNARY_FOLDER(Op, Func) \ + OpFoldResult Op::fold(ArrayRef attrs) { \ + if (getElementTypeOrSelf(getType()).isa()) \ + return UnaryFolder>(this, attrs); \ + if (getElementTypeOrSelf(getType()).isa()) \ + return UnaryFolder>(this, attrs); \ + return {}; \ + } + +UNARY_FOLDER(NegOp, std::negate); + //===----------------------------------------------------------------------===// // BinaryOps //===----------------------------------------------------------------------===// @@ -1720,11 +2086,11 @@ static Attribute FoldSlice(SliceOp* op, I values) { OpFoldResult SliceOp::fold(ArrayRef operands) { // Check if the SliceOp is a NoOp operation. - auto operand_shape = getOperand().getType().cast().getShape(); + auto operand_type = getOperand().getType().cast(); auto result_type = getResult().getType().cast(); - auto result_shape = result_type.getShape(); - if (result_type.hasStaticShape() && (operand_shape == result_shape)) { + if (operand_type.hasStaticShape() && result_type.hasStaticShape() && + (operand_type.getShape() == result_type.getShape())) { return getOperand(); } @@ -1770,7 +2136,7 @@ struct SimplifyConcatSlice : public OpRewritePattern { return failure(); } - auto dimension = concat.dimension().getSExtValue(); + auto dimension = concat.dimension(); auto start = slice.start_indices().getIntValues(); auto limit = slice.limit_indices().getIntValues(); @@ -1920,7 +2286,7 @@ static LogicalResult Verify(SortOp op) { return op.emitOpError("requires all inputs to have the same dimensions"); int64_t rank = input_shape.size(); - int64_t cmp_dim = op.dimension().getSExtValue(); + int64_t cmp_dim = op.dimension(); if (cmp_dim < -rank || cmp_dim >= rank) return op.emitOpError("dimension attribute value must be in range [-") << rank << ", " << rank << "), but found " << cmp_dim; @@ -2121,9 +2487,28 @@ void CompareOp::build(OpBuilder& builder, OperationState& result, Value lhs, build(builder, result, new_type, lhs, rhs, comparison_direction); } +LogicalResult CompareOp::inferReturnTypeComponents( + mlir::MLIRContext*, llvm::Optional, mlir::ValueRange, + mlir::DictionaryAttr, mlir::RegionRange, + llvm::SmallVectorImpl&) { + // TODO(b/168772852) + return failure(); +} + +LogicalResult CompareOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); +} + +} // namespace mhlo +} // namespace mlir #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" +namespace mlir { +namespace mhlo { + //===----------------------------------------------------------------------===// // mhlo Dialect Interfaces //===----------------------------------------------------------------------===// @@ -2150,7 +2535,7 @@ struct HLOInlinerInterface : public DialectInlinerInterface { //===----------------------------------------------------------------------===// MhloDialect::MhloDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { + : Dialect(getDialectNamespace(), context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc index 9fffeae1cc5..503b100c7ab 100644 
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc @@ -18,16 +18,10 @@ limitations under the License. #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/register.h" -// Static initialization for *HLO dialects registration. - -void mlir::mhlo::registerAllDialects() { - static bool init_once = []() { - registerDialect(); - registerDialect(); - registerDialect(); - return true; - }(); - (void)init_once; - - // Dependent dialects +void mlir::mhlo::registerAllMhloDialects(mlir::DialectRegistry ®istry) { + // clang-format off + registry.insert(); + // clang-format on } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc index bbb463cd1a9..cba0d3b4788 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc @@ -29,6 +29,8 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.cc.inc" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Dialect.h" @@ -45,17 +47,48 @@ limitations under the License. #include "mlir/IR/Value.h" namespace mlir { -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.cc.inc" namespace lmhlo { LmhloDialect::LmhloDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { + : Dialect(getDialectNamespace(), context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" >(); } +//===----------------------------------------------------------------------===// +// ConstOp. +//===----------------------------------------------------------------------===// + +/// An lho.constant on an memref that is locally allocated and with no other +/// users (other than dealloc's) can be erased. +// TODO: This can be generalized to an arbitrary op by making use of memory +// effects (write memory effect). +struct EraseConstOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ConstOp op, + PatternRewriter& rewriter) const override { + Value memref = op.output(); + if (!memref.getDefiningOp()) { + return failure(); + } + + // Check that all uses of the memref are either DeallocOps or this op. + for (Operation* user : memref.getUsers()) + if (user != op && !isa(user)) return failure(); + + rewriter.eraseOp(op); + return success(); + } +}; + +void ConstOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // StaticMemRefCastOp //===----------------------------------------------------------------------===// @@ -126,9 +159,15 @@ static LogicalResult Verify(ReshapeMemRefCastOp op) { return success(); } +} // namespace lmhlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" +namespace mlir { +namespace lmhlo { + // TODO(cheshire): Support folding, reuse code from hlo_ops.cc. 
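The EraseConstOp canonicalization added to lhlo_ops.cc above erases a constant whose output buffer is produced by a local alloc and whose only other users are deallocs; the legality test is a single scan over the buffer's users. A standalone sketch of that check, with a hypothetical user model rather than the MLIR use-list API:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the ops using the buffer.
struct Use { std::string op_name; };

// The constant can be erased iff its buffer is locally allocated and every
// other user is a dealloc. (Model only; not the MLIR data structures.)
bool CanEraseConst(bool buffer_is_local_alloc, const std::vector<Use>& users) {
  if (!buffer_is_local_alloc) return false;
  for (const Use& user : users)
    if (user.op_name != "lmhlo.constant" && user.op_name != "dealloc")
      return false;
  return true;
}

int main() {
  std::vector<Use> only_dealloc = {{"lmhlo.constant"}, {"dealloc"}};
  std::vector<Use> also_read = {{"lmhlo.constant"}, {"lmhlo.add"}, {"dealloc"}};
  std::cout << std::boolalpha << CanEraseConst(true, only_dealloc) << " "
            << CanEraseConst(true, also_read) << "\n";  // true false
}
```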
void FusionOp::build(OpBuilder &builder, OperationState &result, diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt new file mode 100644 index 00000000000..e02add4353a --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt @@ -0,0 +1,160 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +include_directories(BEFORE + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + +set(LLVM_TARGET_DEFINITIONS lower_complex_patterns.td) +mlir_tablegen(generated_lower_complex.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloLowerComplexIncGen) + +set(LLVM_TARGET_DEFINITIONS legalize_to_standard_patterns.td) +mlir_tablegen(generated_legalize_to_standard.inc -gen-rewriters) +add_public_tablegen_target(MLIRMhloLegalizeToStandardIncGen) + +set(LLVM_TARGET_DEFINITIONS chlo_legalize_to_hlo_patterns.td) +mlir_tablegen(generated_chlo_legalize_to_hlo.inc -gen-rewriters) +add_public_tablegen_target(MLIRChloLegalizeToHloIncGen) + + +add_mlir_library(ChloPasses + chlo_legalize_to_hlo.cc + chlo_legalize_to_hlo_pass.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRChloLegalizeToHloIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + ChloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(MhloPasses + legalize_gather_to_torch_index_select.cc + legalize_tanh_to_approximation.cc + lower_complex.cc + lower_complex_patterns.td + lower_general_dot.cc + materialize_broadcasts.cc + materialize_broadcasts_pass.cc + mhlo_fusion.cc + optimize_mhlo.cc + optimize_mhlo_pass.cc + sink_constants_to_control_flow.cc + test_infer_shaped_type_pass.cc + transform_unranked_hlo.cc + unfuse_batch_norm.cc + unfuse_batch_norm_pass.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRMhloLowerComplexIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRIR + MLIRMhloUtils + MLIRPass + MLIRTransformUtils +) + +add_mlir_library(MhloToLhloConversion + hlo_legalize_to_lhlo.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MhloDialect + LmhloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(MhloToStandard + legalize_control_flow.cc + legalize_to_standard.cc + mhlo_control_flow_to_scf.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + MLIRMhloLegalizeToStandardIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass +) + +add_mlir_library(MhloLhloToLinalg + legalize_to_linalg.cc + + DEPENDS + MLIRhlo_opsIncGen + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MhloDialect + MLIRIR + MLIRPass +) + +add_mlir_library(LmhloPasses + lhlo_fuse_linalg.cc + lhlo_legalize_to_affine.cc + lhlo_legalize_to_gpu.cc + lhlo_legalize_to_llvm.cc + lhlo_legalize_to_llvm_pass.cc + lhlo_legalize_to_parallel_loops.cc + + DEPENDS + MLIRlhlo_opsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + LmhloDialect + MLIRIR + MLIRPass +) + 
+add_library(AllMhloPasses INTERFACE) +target_link_libraries(AllMhloPasses INTERFACE + ChloPasses + MhloPasses + MhloToLhloConversion + MhloToStandard + MhloLhloToLinalg + LmhloPasses +) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc index adbd2e5a628..626b5d3bd59 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" @@ -31,6 +33,39 @@ namespace mlir { namespace chlo { namespace { +struct ConvertConstantLikeOp : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite( + ConstantLikeOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto result_ty = op.getType().cast(); + + // Unranked uses are not supported. Consider `transform-unranked-hlo`. + if (!result_ty.hasRank()) return failure(); + + // Lower to MHLO constant if statically shaped. + if (result_ty.hasStaticShape()) { + rewriter.replaceOpWithNewOp( + op, DenseElementsAttr::get(result_ty, op.value())); + return success(); + } + + // Lower to broadcasted constant. + ConstantLikeOp::Adaptor transformed(operands); + auto loc = op.getLoc(); + Type extent_tensor_type = shape::getExtentTensorType(op.getContext()); + Value constant = rewriter.create(loc, op.value()); + Value uncasted_shape = rewriter.create( + loc, extent_tensor_type, transformed.operand()); + Type shape_ty = + RankedTensorType::get({result_ty.getRank()}, rewriter.getIndexType()); + Value shape = rewriter.create(loc, shape_ty, uncasted_shape); + rewriter.replaceOpWithNewOp( + op, result_ty, constant, shape, rewriter.getI64TensorAttr({})); + return success(); + } +}; + // Converts binary ops that statically are determined to not broadcast directly // to the corresponding mhlo non-broadcasting op. template @@ -124,8 +159,8 @@ struct ConvertRankedDynamicBroadcastBinaryOp int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); Value result_extents = - hlo::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs, - rewriter); + hlo::ComputeBinaryElementwiseBroadcastingResultExtents( + loc, lhs, rhs, rewriter, /*unsafe_as_extent_tensor=*/true); // Note that we unconditionally emit DynamicBroadcastInDim ops and let // downstream canonicalizations fold them away if possible. 
This is @@ -338,30 +373,37 @@ struct ConvertUnrankedDynamicBroadcastBinaryOp Value lhs_shape = if_builder.create(loc, lhs); Value rhs_shape = if_builder.create(loc, rhs); SmallVector ranked_shape(targeted_rank, 1); - auto extent_tensor_type = + auto unknown_rank_extent_tensor_type = RankedTensorType::get( + {RankedTensorType::kDynamicSize}, builder.getIndexType()); + auto known_rank_extent_tensor_type = RankedTensorType::get({targeted_rank}, builder.getIndexType()); auto reshaped_type = RankedTensorType::get( llvm::SmallVector(targeted_rank, RankedTensorType::kDynamicSize), lhs.getType().template dyn_cast().getElementType()); Value ranked_shape_val = if_builder.create( - loc, extent_tensor_type, - mlir::DenseIntElementsAttr::get(extent_tensor_type, ranked_shape)); - // TODO(tpopp): Return extent tensors when possible to signal that this is a - // guaranteed safe broadcast by construction. + loc, known_rank_extent_tensor_type, + mlir::DenseIntElementsAttr::get(known_rank_extent_tensor_type, + ranked_shape)); Value extended_lhs = if_builder.create( - loc, extent_tensor_type, lhs_shape, ranked_shape_val, nullptr); + loc, unknown_rank_extent_tensor_type, lhs_shape, ranked_shape_val, + nullptr); + Value extended_lhs_casted = if_builder.create( + loc, known_rank_extent_tensor_type, extended_lhs); Value extended_rhs = if_builder.create( - loc, extent_tensor_type, rhs_shape, ranked_shape_val, nullptr); + loc, unknown_rank_extent_tensor_type, rhs_shape, ranked_shape_val, + nullptr); + Value extended_rhs_casted = if_builder.create( + loc, known_rank_extent_tensor_type, extended_rhs); // 1. Reshape operands to the given rank (with the same number of elements) // 2. Compute the ranked-broadcasted ChloOp (which will assert that the ops // can be broadcasted and do the actual broadcasting) // 3. Type erase the output back to unranked Value reshaped_lhs = if_builder.create( - loc, reshaped_type, lhs, extended_lhs); + loc, reshaped_type, lhs, extended_lhs_casted); Value reshaped_rhs = if_builder.create( - loc, reshaped_type, rhs, extended_rhs); + loc, reshaped_type, rhs, extended_rhs_casted); Value result = if_builder.create( loc, ArrayRef{reshaped_type}, ArrayRef{reshaped_lhs, reshaped_rhs}, op.getAttrs()); @@ -469,10 +511,13 @@ struct HloCompareAdaptor { } }; +#include "generated_chlo_legalize_to_hlo.inc" } // namespace void PopulateLegalizeChloToHloPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { + populateWithGenerated(context, patterns); + // Instantiate conversion templates for conforming binary elementwise ops // that do not have different dtypes between operands and results and do // not have special attributes that need to be preserved. @@ -502,6 +547,9 @@ void PopulateLegalizeChloToHloPatterns(MLIRContext *context, context, patterns); PopulateForBinaryOp( context, patterns); + + // Other patterns. 
+ patterns->insert(context); } } // namespace chlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc index 50cd6df5c99..263b6cdd1c3 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc @@ -29,6 +29,10 @@ namespace { struct TestChloLegalizeToHloPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnFunction() override { ConversionTarget conversionTarget(getContext()); OwningRewritePatternList conversionPatterns; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td new file mode 100644 index 00000000000..7b612ff4b02 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the legalization pattern definition file for CHLO to MHLO. + +include "mlir/IR/OpBase.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.td" + +//===----------------------------------------------------------------------===// +// Unary op patterns. +//===----------------------------------------------------------------------===// + +// Expand acos to MHLO dialect as follows: +// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 +// = pi if x == -1 +def : Pat<(HLOClient_AcosOp $input), + (HLO_SelectOp + (HLO_CompareOp $input, + (HLO_ConstantLike<"0"> $input), + HLO_COMPARISON_DIRECTION_NE + ), + (HLO_MulOp + (HLO_ConstantLike<"2.0f"> $input), + (HLO_Atan2Op + (HLO_SqrtOp + (HLO_SubOp + (HLO_ConstantLike<"1"> $input), + (HLO_MulOp $input, $input) + ) + ), + (HLO_AddOp + (HLO_ConstantLike<"1"> $input), + $input + ) + ) + ), + (HLO_ConstantLike<"M_PI"> $input))>; + +// Express tan in MHLO dialect as +// tan(x) = sin(x) / cos(x). 
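The acos pattern above and the tan pattern that follows lean on the identities acos(x) = 2 * atan2(sqrt(1 - x^2), 1 + x) for x != -1 (with acos(-1) = pi) and tan(x) = sin(x) / cos(x). A quick standalone numeric check of both, assuming ordinary <cmath> semantics:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // acos(x) vs 2 * atan2(sqrt(1 - x^2), 1 + x), valid for x in (-1, 1].
  for (double x : {-0.9, -0.25, 0.0, 0.5, 0.99}) {
    double expanded = 2.0 * std::atan2(std::sqrt(1.0 - x * x), 1.0 + x);
    std::printf("acos(% .2f): %.12f vs %.12f\n", x, std::acos(x), expanded);
  }
  // tan(x) vs sin(x) / cos(x).
  for (double x : {-1.2, 0.3, 1.0}) {
    std::printf("tan(% .2f):  %.12f vs %.12f\n", x, std::tan(x),
                std::sin(x) / std::cos(x));
  }
}
```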
+def : Pat<(HLOClient_TanOp $input), + (HLO_DivOp + (HLO_SinOp $input), + (HLO_CosOp $input) + )>; + diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc index a8c3ad17ebb..0f1a3d034eb 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc @@ -78,7 +78,6 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, } Value InsertAlloc(Location loc, OpResult result, - BufferAssignmentPlacer* bufferAssignment, ConversionPatternRewriter* rewriter) { auto result_type = result.getType().dyn_cast(); if (!result_type || !result_type.hasStaticShape()) { @@ -88,8 +87,7 @@ Value InsertAlloc(Location loc, OpResult result, auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); OpBuilder::InsertionGuard guard(*rewriter); - rewriter->restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); + rewriter->setInsertionPoint(result.getDefiningOp()); auto alloc = rewriter->create(loc, memref_type); return alloc; } @@ -111,8 +109,8 @@ class HloToLhloOpConverter : public BaseOpConversion { return failure(); } if (resultType.hasStaticShape()) { - buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(), - this->bufferAssignment, &rewriter)); + buffer_args.push_back( + InsertAlloc(op->getLoc(), result.value(), &rewriter)); } else { SmallVector results_shape; auto shape_type_op = dyn_cast(op); @@ -259,8 +257,7 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : original_results) { - buffer_args.push_back( - InsertAlloc(loc, result, this->bufferAssignment, &rewriter)); + buffer_args.push_back(InsertAlloc(loc, result, &rewriter)); } auto new_op = rewriter.create(loc, llvm::None, buffer_args, op.getAttrs()); @@ -290,11 +287,36 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { } }; -// Legalize mhlo.return to a lmhlo.copy and lmhlo.terminator. This functionality -// is provided by mlir buffer assignment, so use the pattern from there. -// TODO(DFKI): Move this out of detail. -using HloToLhloReturnOpConverter = detail::BufferAssignmentReturnOpConverter< - mhlo::ReturnOp, lmhlo::TerminatorOp, lmhlo::CopyOp, false>; +// Legalize mhlo.return to a lmhlo.copy and lmhlo.terminator. +struct HloToLhloReturnOpConverter : public BaseOpConversion { + public: + using BaseOpConversion::BaseOpConversion; + + LogicalResult matchAndRewrite( + mhlo::ReturnOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + auto loc = op.getLoc(); + auto& entry_block = op.getParentRegion()->front(); + auto num_arguments = entry_block.getNumArguments(); + if (operands.size() > num_arguments) { + return op.emitError( + "The number of operands that need Copy operations is more " + "than the number of target function arguments."); + } + + // The index of the first output block argument. + auto dest_arg_idx = num_arguments - operands.size(); + + // Create a lmhlo.copy for each operand of mhlo.return. 
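The loop that follows copies each mhlo.return operand into one of the trailing block arguments appended during bufferization, starting at index num_arguments - num_results. A standalone sketch of that index mapping, with hypothetical value and argument names:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// After bufferization the converted region carries one extra trailing memref
// argument per result, so result i is copied into block argument
// (num_arguments - num_results + i). Names below are illustrative only.
int main() {
  std::vector<std::string> block_args = {"%arg0", "%arg1", "%out0", "%out1"};
  std::vector<std::string> return_operands = {"%v0", "%v1"};

  size_t dest_arg_idx = block_args.size() - return_operands.size();
  for (const std::string& operand : return_operands) {
    std::printf("lmhlo.copy %s -> %s\n", operand.c_str(),
                block_args[dest_arg_idx].c_str());
    ++dest_arg_idx;
  }
  std::printf("lmhlo.terminator\n");
}
```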
+ for (Value operand : operands) { + rewriter.create(loc, operand, + entry_block.getArgument(dest_arg_idx)); + ++dest_arg_idx; + } + rewriter.replaceOpWithNewOp(op); + return success(); + } +}; class HloToLhloTensorLoadOpConverter : public BaseOpConversion { @@ -388,6 +410,10 @@ class HloToLhloTensorStoreOpConverter struct HloLegalizeToLhlo : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: HloLegalizeToLhlo() = default; HloLegalizeToLhlo(const HloLegalizeToLhlo& o) { @@ -428,28 +454,19 @@ struct HloLegalizeToLhlo isMemRefType); }); - auto module = getOperation(); - WalkResult result = module.walk([&](FuncOp func) -> WalkResult { - BufferAssignmentPlacer bufferAssignment(func); - OwningRewritePatternList patterns; - populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, - &converter, &patterns); - if (results_escape_function) { - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, lmhlo::CopyOp, - /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment, - &converter, &patterns); - } else { - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, lmhlo::CopyOp, - /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment, - &converter, &patterns); - } - return applyPartialConversion(func, target, patterns); - }); - if (result.wasInterrupted()) { + auto kind = results_escape_function + ? BufferAssignmentTypeConverter::KeepAsFunctionResult + : BufferAssignmentTypeConverter::AppendToArgumentsList; + converter.setResultConversionKind( + kind); + converter.setResultConversionKind(kind); + + populateHLOToLHLOConversionPattern(&context, &converter, &patterns); + populateWithBufferAssignmentOpConversionPatterns< + mlir::ReturnOp, mlir::ReturnOp, lmhlo::CopyOp>(&context, &converter, + &patterns); + if (failed(applyPartialConversion(getOperation(), target, patterns))) signalPassFailure(); - } } private: @@ -462,8 +479,8 @@ struct HloLegalizeToLhlo } // namespace void populateHLOToLHLOConversionPattern( - MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, - TypeConverter* converter, OwningRewritePatternList* patterns) { + MLIRContext* context, BufferAssignmentTypeConverter* converter, + OwningRewritePatternList* patterns) { // clang-format off patterns->insert< HloToLhloDynamicBroadcastInDimOpConverter, @@ -471,6 +488,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -483,6 +501,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -497,14 +516,17 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloReduceOpConverter, HloToLhloReturnOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter - >(context, bufferAssignment, converter); + >(context, converter); // clang-format on } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc index f47f2c2fbdc..0a8105eb366 100644 --- 
a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc @@ -15,6 +15,8 @@ limitations under the License. // This file implements logic for lowering HLO/LHLO dialect to Linalg dialect. +#include + #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" @@ -598,6 +600,7 @@ class ReshapeOpConverter : public OpConversionPattern { unsigned currSrcDim = 0, currDstDim = 0; SmallVector reassociationMap( dstShape.size()); + bool isExpandingOrCollapsing = true; while (currSrcDim < srcShape.size() && currDstDim < dstShape.size()) { int64_t dstSize = dstShape[currDstDim]; int64_t srcSize = srcShape[currSrcDim]; @@ -619,11 +622,48 @@ class ReshapeOpConverter : public OpConversionPattern { } } } else { - return failure(); + isExpandingOrCollapsing = false; + break; } currDstDim++; } - if (currSrcDim != srcShape.size()) return failure(); + if (currSrcDim != srcShape.size() || currDstDim != dstShape.size()) + isExpandingOrCollapsing = false; + + if (!isExpandingOrCollapsing) { + auto getIdentityExprs = [&rewriter](int n) { + SmallVector exprs; + for (int i = 0; i < n; ++i) + exprs.push_back(rewriter.getAffineDimExpr(i)); + return exprs; + }; + Location loc = reshapeOp.getLoc(); + int64_t totalElems = std::accumulate(srcShape.begin(), srcShape.end(), 1, + std::multiplies()); + auto elemType = operandType.getElementType(); + SmallVector collapsingMap = { + getIdentityExprs(dstShape.size())}; + SmallVector expandingMap = { + getIdentityExprs(srcShape.size())}; + + if (isLHLO) { + auto collapsedType = MemRefType::get({totalElems}, elemType); + Value collapsedOp = rewriter.create( + loc, collapsedType, args[0], collapsingMap); + Value reshapeBuffer = rewriter.create( + loc, resultType, collapsedOp, expandingMap); + rewriter.replaceOpWithNewOp( + reshapeOp, reshapeBuffer, args[1], /*inputPermutation =*/nullptr, + /*outputPermutation =*/nullptr); + } else { + auto collapsedType = RankedTensorType::get({totalElems}, elemType); + Value collapsedOp = rewriter.create( + loc, collapsedType, args[0], collapsingMap); + rewriter.replaceOpWithNewOp( + reshapeOp, resultType, collapsedOp, expandingMap); + } + return success(); + } if (isLHLO) { Value reshapeBuffer = rewriter.create( @@ -665,7 +705,7 @@ class IotaConverter : public OpConversionPattern { [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange ivs, ValueRange args) { Value castOp = nestedBuilder.create( - nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()], + nestedLoc, ivs[iotaOp.iota_dimension()], nestedBuilder.getIntegerType( resultElementType.getIntOrFloatBitWidth())); if (resultElementType.template isa()) { @@ -783,6 +823,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -801,7 +842,8 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, ReshapeOpConverter, ReverseConverter, ScalarPointwiseToStandardConverter, - SliceConverter + SliceConverter, + TransposeConverter >(context); // clang-format on } @@ -827,6 +869,10 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, // } : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () struct LhloLegalizeToLinalgPass : public PassWrapper { + 
void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -843,6 +889,10 @@ struct LhloLegalizeToLinalgPass struct HloLegalizeToLinalgPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -882,6 +932,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc index cc574e008d5..d2d4bab45ab 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc @@ -117,7 +117,7 @@ class ConvertIotaOp : public OpRewritePattern { PatternRewriter &rewriter) const override { auto output_type = op.getType().cast(); auto output_size = output_type.getNumElements(); - auto dimension = op.iota_dimension().getSExtValue(); + auto dimension = op.iota_dimension(); auto max_dim_size = output_type.getDimSize(dimension); auto element_type = output_type.getElementType(); @@ -178,6 +178,10 @@ class ConvertIotaOp : public OpRewritePattern { namespace { struct LegalizeToStandardPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + /// Perform the lowering to Standard dialect. void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td index ea67c052c5c..6ee6f124628 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard_patterns.td @@ -36,6 +36,10 @@ def IsSameSizePred : CPred< def IsSameSizeConstraint : Constraint; +// Unary Lowering Patterns. +def : Pat<(HLO_CeilOp HLO_FpTensor:$i), (CeilFOp $i)>; + +// Binary Lowering Patterns. def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r), (AndOp $l, $r), [(IsSameSizeConstraint $l, $r)]>; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc deleted file mode 100644 index 7a4418466b5..00000000000 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements a pass to remove redundant LHLO copy operations. - -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Operation.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -namespace lmhlo { -namespace { - -// Removes LHLO copy operations that copy from allocated buffers to block -// arguments. All uses of each buffer are replaced with the corresponding block -// argument and the buffer is freed. Note that this pass only works in regions -// with a single block. -struct LhloCopyRemovalPass - : mlir::PassWrapper> { - void runOnOperation() override { - llvm::SmallVector eraseList; - auto operation = getOperation(); - operation->walk([&](mlir::lmhlo::CopyOp copyOp) { - // If this region contains more than one block, then ignore this copy - // operation. - if (copyOp.getParentRegion()->getBlocks().size() > 1) { - return; - } - - mlir::Value fromOperand = copyOp.operand(); - mlir::Value toOperand = copyOp.output(); - - // If the fromOperand value is a block argument or the toOperand - // value is not a block argument, then ignore this copy operation. - if (!fromOperand.getDefiningOp() || toOperand.getDefiningOp()) { - return; - } - - // The copy operation removal is illegal if there is at least a single use - // of toOperand value that lies between the first use of fromOperand value - // and the copy operation. - auto fromOperandUsers = fromOperand.getUsers(); - auto firstUser = *fromOperandUsers.begin(); - for (auto op : fromOperandUsers) { - if (op->isBeforeInBlock(firstUser)) firstUser = op; - } - for (auto op : toOperand.getUsers()) { - if (op->isBeforeInBlock(copyOp) && firstUser->isBeforeInBlock(op)) { - return; - } - } - - // TODO(DFKI): Use live variable analysis to solve aliasing issues among - // block arguments. - - // Remove the associated alloc operation. - auto allocOp = fromOperand.getDefiningOp(); - eraseList.push_back(allocOp); - - // Iterate over all uses of the fromOperand to find the associated - // deallocOp (if any). - for (auto op : fromOperandUsers) { - if (isa(op)) { - eraseList.push_back(op); - break; - } - } - - // Replace all uses of the fromOperand with the toOperand. This rewires - // all references pointing to the original alloc operation to the new - // target operation in order to safely remove the copy op. - fromOperand.replaceAllUsesWith(toOperand); - copyOp.erase(); - }); - for (auto op : eraseList) { - op->erase(); - } - }; -}; - -} // namespace - -std::unique_ptr createLhloCopyRemovalPass() { - return std::make_unique(); -} - -} // namespace lmhlo -} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc index 1467f015dc9..6dc5b64a105 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc @@ -19,8 +19,10 @@ limitations under the License. 
#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/FoldUtils.h" @@ -33,6 +35,10 @@ using linalg::LinalgOp; class LhloFuseLinalgPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: LhloFuseLinalgPass() = default; LhloFuseLinalgPass(const LhloFuseLinalgPass&) {} diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc index 07891327775..2771afc6302 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc @@ -139,6 +139,9 @@ void populateLHLOToAffineConversionPattern(MLIRContext* context, struct LhloLegalizeToAffinePass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } void runOnFunction() override { OwningRewritePatternList patterns; auto func = getFunction(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc index 0d0b8b0ab6e..fbade8f7387 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc @@ -20,8 +20,10 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" @@ -147,9 +149,9 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { // Now copy over the actual body of the reduction, leaving out the // terminator. 
BlockAndValueMapping mapping; - mapping.map(reduce_op.body().front().getArgument(0), accumulator); - mapping.map(reduce_op.body().front().getArgument(1), rhs); - mapping.map(reduce_op.body().front().getArgument(2), accumulator); + mapping.map(reduce_op.body().getArgument(0), accumulator); + mapping.map(reduce_op.body().getArgument(1), rhs); + mapping.map(reduce_op.body().getArgument(2), accumulator); for (auto& nested : reduce_op.body().front().without_terminator()) { auto clone = rewriter.clone(nested, mapping); for (auto pair : llvm::zip(nested.getResults(), clone->getResults())) { @@ -169,6 +171,11 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { struct LhloLegalizeToGpuPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc index af64c448ad9..57ea947c473 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc @@ -45,7 +45,7 @@ struct StaticMemRefCastOpConverter return failure(); // Create descriptor. auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); - Type llvmTargetElementTy = desc.getElementType(); + Type llvmTargetElementTy = desc.getElementPtrType(); // Set allocated ptr. Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); allocated = @@ -96,7 +96,7 @@ struct DynamicMemRefCastOpConverter return failure(); // Create descriptor. auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); - Type llvmTargetElementTy = desc.getElementType(); + Type llvmTargetElementTy = desc.getElementPtrType(); // Set allocated ptr. Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); allocated = @@ -217,8 +217,7 @@ struct ReshapeMemRefCastOpConverter SmallVector sizes; UnrankedMemRefDescriptor::computeSizes(rewriter, loc, typeConverter, {target_desc}, sizes); - auto void_ptr_type = - LLVM::LLVMType::getInt8PtrTy(typeConverter.getDialect()); + auto void_ptr_type = LLVM::LLVMType::getInt8PtrTy(rewriter.getContext()); Value ranked_desc_mem = rewriter.create( loc, void_ptr_type, sizes.front(), llvm::None); target_desc.setMemRefDescPtr(rewriter, loc, ranked_desc_mem); @@ -282,7 +281,7 @@ struct ReshapeMemRefCastOpConverter auto index_arg = cond_block->addArgument(typeConverter.getIndexType()); auto stride_arg = cond_block->addArgument(typeConverter.getIndexType()); auto pred = rewriter.create( - loc, LLVM::LLVMType::getInt1Ty(typeConverter.getDialect()), + loc, LLVM::LLVMType::getInt1Ty(rewriter.getContext()), LLVM::ICmpPredicate::sge, index_arg, zero_index); Block *body_block = diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc index 00252735023..3d49027bb50 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc @@ -15,8 +15,6 @@ limitations under the License. 
#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -31,6 +29,10 @@ namespace { class TestLhloToLLVMPass : public ::mlir::PassWrapper> { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); @@ -39,8 +41,6 @@ class TestLhloToLLVMPass LLVMTypeConverter converter(&getContext()); populateStdToLLVMConversionPatterns(converter, patterns); PopulateLhloToLLVMConversionPatterns(&converter, &patterns); - populateLoopToStdConversionPatterns(patterns, &getContext()); - populateAffineToStdConversionPatterns(patterns, &getContext()); ConversionTarget target(getContext()); target.addLegalDialect(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc index 19f47d08c0d..d9a2d993496 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc @@ -691,6 +691,10 @@ class SelectAndScatterOpConverter struct LhloLegalizeToParallelLoopsPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { auto func = getFunction(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc new file mode 100644 index 00000000000..dba3cab6956 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc @@ -0,0 +1,199 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/Casting.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" + +#define DEBUG_TYPE "mhlo-control-flow-to-scf" + +namespace mlir { +namespace mhlo { + +namespace { + +/// Convert MHLO While to SCF. +void MatchAndRewrite(WhileOp whileOp); + +/// Pass that converts MHLO control flow to SCF. 
+class ControlFlowToScfPass + : public mlir::PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { + getFunction().walk([&](WhileOp whileOp) { MatchAndRewrite(whileOp); }); + } +}; + +// TODO(jpienaar): Look into reformulating as a pattern. +void MatchAndRewrite(WhileOp whileOp) { + // Handle pattern: + // x = start + // step = ... + // limit = ... + // while (x < limit) { ... x += step; } + + // Only handling multi value while loops at the moment. + auto tupleOp = whileOp.getOperand().getDefiningOp(); + if (!tupleOp) return; + auto bodyReturn = whileOp.body() + .front() + .getTerminator() + ->getOperand(0) + .getDefiningOp(); + // Note: due to the shape restrictions on While, if the operand to While is a + // tuple, then so is the return type of the body. But the verifier isn't + // checking that at the moment, so just bail out here if this doesn't hold. + if (!bodyReturn) return; + + Value result = whileOp.cond().front().getTerminator()->getOperand(0); + // TODO(jpienaar): Expand to handle more than simple case with LT compare and + // constant step. + auto cmp = result.getDefiningOp(); + if (!cmp || cmp.comparison_direction() != "LT") return; + + const int kConstant = -1; + auto getValueAndIndex = [&](Value val) -> std::pair { + if (matchPattern(val, m_Constant())) return {val, kConstant}; + // If it is defined by a tuple, then the tuple has to have been fed in and + // the external value is captured. + if (auto gte = val.getDefiningOp()) { + if (!gte.getOperand().isa()) return {nullptr, 0}; + int index = gte.index(); + return {tupleOp.getOperand(index), index}; + } + return {nullptr, 0}; + }; + + using ValueIndex = std::pair; + ValueIndex loopIndVar = getValueAndIndex(cmp.lhs()); + ValueIndex max = getValueAndIndex(cmp.rhs()); + if (!loopIndVar.first || !max.first) return; + auto add = + bodyReturn.getOperand(loopIndVar.second).getDefiningOp(); + if (!add) return; + ValueIndex step = getValueAndIndex(add.rhs()); + if (step.second != kConstant || !step.first) return; + + // Only handle case where tuple isn't propagated as is for now. + // TODO(jpienaar): Remove this when a tuple is also created inside the loop + // to propagate. + for (auto* use : whileOp.body().front().getArgument(0).getUsers()) + if (!isa(use)) return; + + LLVM_DEBUG(llvm::dbgs() << "Found for (" << whileOp.getLoc() << "):\n"; + llvm::dbgs() << " loopIndVar = " << loopIndVar.second << " max = " + << max.second << " step = " << step.second << "\n"; + llvm::dbgs() << " loopIndVar = " << loopIndVar.first << " max = " + << max.first << " step = " << step.first << "\n";); + OpBuilder b(whileOp); + // Inputs to new for loop. + llvm::SmallVector input; + input.reserve(tupleOp.getNumOperands()); + for (auto r : tupleOp.getOperands().take_front(loopIndVar.second)) + input.push_back(r); + for (auto r : tupleOp.getOperands().drop_front(loopIndVar.second + 1)) + input.push_back(r); + + auto tensorIndexType = RankedTensorType::get({}, b.getIndexType()); + auto getAsIndex = [&](Value val) { + auto loc = whileOp.getLoc(); + return b.create( + loc, b.create(loc, tensorIndexType, val), ValueRange()); + }; + + // SCF for uses index type, so converted these. + auto forloopIndVar = getAsIndex(loopIndVar.first); + auto forMax = getAsIndex(max.first); + auto forStep = getAsIndex(step.first); + auto forOp = b.create(whileOp.getLoc(), forloopIndVar, + forMax, forStep, input); + // Transfer the body without the block arguments. 
+ forOp.getLoopBody().front().getOperations().splice( + forOp.getLoopBody().front().getOperations().end(), + whileOp.body().front().getOperations()); + + b.setInsertionPointToStart(&forOp.getLoopBody().front()); + auto loopIndVarElType = + loopIndVar.first.getType().cast().getElementType(); + Value indVar = b.create( + whileOp.getLoc(), RankedTensorType::get({}, loopIndVarElType), + b.create(whileOp.getLoc(), loopIndVarElType, + forOp.getInductionVar())); + // Update all block argument users to the SCF For args. + for (auto* use : + llvm::make_early_inc_range(whileOp.body().getArgument(0).getUsers())) { + // TODO(jpienaar): Expand here too when we allow using the tuple in the + // loop. + auto gte = cast(use); + // If the loop induction var, then refer to the loop induction variable as + // this operand is not updated. + if (gte.index() == loopIndVar.second) { + use->getResult(0).replaceAllUsesWith(indVar); + use->erase(); + continue; + } + int index = gte.index(); + // If after the loop induction variable, then decrement as we don't include + // the loop induction variable in the for iter operands. + if (index > loopIndVar.second) --index; + use->getResult(0).replaceAllUsesWith(forOp.getIterOperands()[index]); + use->erase(); + } + + // Create new yield op without induction var update. + SmallVector newYieldOps; + newYieldOps.reserve(bodyReturn.getNumOperands() - 1); + for (auto r : bodyReturn.getOperands().take_front(loopIndVar.second)) + newYieldOps.push_back(r); + for (auto r : bodyReturn.getOperands().drop_front(loopIndVar.second + 1)) + newYieldOps.push_back(r); + // Delete return & tuple op. + forOp.getLoopBody().front().back().erase(); + forOp.getLoopBody().front().back().erase(); + b.setInsertionPointToEnd(&forOp.getLoopBody().front()); + b.create(whileOp.getLoc(), newYieldOps); + + // Recombine output tuple with max value of induction variable. + llvm::SmallVector loopOut; + loopOut.reserve(forOp.getNumResults() + 1); + for (auto r : forOp.getResults().take_front(loopIndVar.second)) + loopOut.push_back(r); + loopOut.push_back(max.first); + for (auto r : forOp.getResults().drop_front(loopIndVar.second)) + loopOut.push_back(r); + b.setInsertionPoint(whileOp); + auto newRes = b.create(whileOp.getLoc(), loopOut); + whileOp.replaceAllUsesWith(newRes.getOperation()); + whileOp.erase(); +} + +} // anonymous namespace + +std::unique_ptr> createControlFlowToScfPass() { + return std::make_unique(); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc index 8db5d849322..4a17a5b5391 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" #include "mlir/Dialect/Shape/IR/Shape.h" @@ -27,7 +28,6 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" namespace mlir { -namespace mhlo { namespace { // TODO(herhut): Generate these out of op definitions. 
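In the while-to-scf.for conversion above, the induction element is pulled out of the operand tuple and carried by the loop itself, so every tuple index that sits after the induction position shifts down by one when it becomes an iter operand. A small sketch of that remapping (purely illustrative; RemapTupleIndex is a hypothetical helper, independent of the MLIR types involved):

#include <cassert>

// Map a tuple index from the original mhlo.while operand tuple to the
// position of the corresponding scf.for iter operand, given that the
// induction element at `ind_pos` is no longer part of the iter operands.
// Returns -1 for the induction element itself, which is served by the
// loop's induction variable instead.
int RemapTupleIndex(int index, int ind_pos) {
  if (index == ind_pos) return -1;
  return index > ind_pos ? index - 1 : index;
}

int main() {
  // With the induction element at tuple position 1, elements 0, 2, 3 of the
  // original tuple become iter operands 0, 1, 2.
  assert(RemapTupleIndex(0, 1) == 0);
  assert(RemapTupleIndex(1, 1) == -1);
  assert(RemapTupleIndex(2, 1) == 1);
  assert(RemapTupleIndex(3, 1) == 2);
  return 0;
}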
@@ -46,115 +46,80 @@ namespace { sep fn(ShiftLeftOp) sep fn(ShiftRightArithmeticOp) \ sep fn(ShiftRightLogicalOp) sep fn(SubOp) -// TODO(frgossen): Make it variadic. +// TODO(herhut): Generate these out of op definitions. +#define MAP_CHLO_OPERATION_CWISE_UNARY(fn, sep) fn(TanOp) sep fn(AcosOp) + template inline void AddLegalOpOnRankedTensor(ConversionTarget *target) { target->addDynamicallyLegalOp([](OpTy op) { - return llvm::all_of((op.getOperation())->getOperandTypes(), + return llvm::all_of(op.getOperation()->getOperandTypes(), [&](Type t) { return t.isa(); }); }); } -/// Unary element-wise operations on unranked tensors can be applied to the -/// flattened tensor with the same effect. -/// This pattern rewrites every such operation to +/// Element-wise operations on unranked tensors can be applied to the flattened +/// tensor operands with the same effect. This pattern rewrites every such +/// operation to /// (i) flatten the input tensor, -/// (ii) apply the unary operation, and +/// (ii) apply the operation, and /// (iii) restore the original shape. template -struct UnaryElementwiseOpConversion : public OpRewritePattern { - explicit UnaryElementwiseOpConversion(MLIRContext *context) +struct ElementwiseOpConversion : public OpRewritePattern { + explicit ElementwiseOpConversion(MLIRContext *context) : OpRewritePattern(context) {} LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - // Don't apply conversion to ops with statically shaped operands. - Value operand = op.getOperand(); - auto operandTy = operand.getType().dyn_cast(); - if (operandTy.hasRank()) return failure(); - - // Generate IR to flatten the operand. - auto loc = op.getLoc(); - Value shape = rewriter.create(loc, operand); - Value numElements = rewriter.create(loc, shape); - Value numElementsAsIndex = - rewriter.create(loc, numElements); - Value flatShapeAsDimTensor = - rewriter.create(loc, numElementsAsIndex); - auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - operandTy.getElementType()); - Value flatOperand = rewriter.create( - loc, flatTensorTy, operand, flatShapeAsDimTensor); - - // Generate IR for the actual operation. - Value flatResult = rewriter.create(loc, flatTensorTy, flatOperand); - - // Generate IR to restore the original shape. - auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rewriter.getIndexType()); - Value shapeAsExtentTensor = - rewriter.create(loc, extentTensorTy, shape); - Value result = rewriter.create( - loc, operandTy, flatResult, shapeAsExtentTensor); - rewriter.replaceOp(op, result); - - return success(); - } -}; - -/// Binary element-wise operation on unranked tensors can be applied to the -/// flattened operand tensors with the same effect. -/// This pattern rewrites every such operation to -/// (i) flatten the operand tensors, -/// (ii) apply the binary operation, and -// (iii) restore the original shape. -template -struct BinaryElementwiseOpConversion : public OpRewritePattern { - explicit BinaryElementwiseOpConversion(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(OpTy op, - PatternRewriter &rewriter) const override { - // Don't apply conversion unless both operands are unranked. - if (op.lhs().getType().template isa() || - op.rhs().getType().template isa()) { + // Don't apply conversion unless all operands are unranked. 
+ if (!llvm::all_of(op.getOperation()->getOperands(), [&](Value operand) { + return operand.getType().isa(); + })) { return failure(); } - // Flatten operands. - Type shapeTy = shape::ShapeType::get(rewriter.getContext()); + // Get operands' shape. auto loc = op.getLoc(); - Value shapeLhs = rewriter.create(loc, op.lhs()); - Value shapeRhs = rewriter.create(loc, op.rhs()); - Value shape = rewriter.create(loc, shapeTy, - ValueRange{shapeLhs, shapeRhs}); - Value numElements = rewriter.create(loc, shape); - Value numElementsAsIndex = - rewriter.create(loc, numElements); - Value flatShape = - rewriter.create(loc, numElementsAsIndex); - TensorType lhsTy = op.lhs().getType().template cast(); - Type flatLhsTy = RankedTensorType::get({ShapedType::kDynamicSize}, - lhsTy.getElementType()); - Value flatLhs = - rewriter.create(loc, flatLhsTy, op.lhs(), flatShape); - TensorType rhsTy = op.rhs().getType().template cast(); - Type flatRhsTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rhsTy.getElementType()); - Value flatRhs = - rewriter.create(loc, flatRhsTy, op.rhs(), flatShape); + Type extentTensorTy = shape::getExtentTensorType(rewriter.getContext()); + SmallVector operandShapes; + for (Value operand : op.getOperation()->getOperands()) { + Value shape = + rewriter.create(loc, extentTensorTy, operand); + operandShapes.push_back(shape); + } + Value shape = + operandShapes.size() == 1 + ? operandShapes.front() + : rewriter.create(loc, extentTensorTy, operandShapes); - // Apply actual operation to flattened operands. - Value flatResult = rewriter.create(loc, flatLhs, flatRhs); + // Derive flat shape. + Type indexTy = rewriter.getIndexType(); + Value numElements = + rewriter.create(loc, indexTy, shape); + Value flatShape = rewriter.create(loc, numElements); + + // Flatten operands. + SmallVector flatOperands; + for (Value operand : op.getOperation()->getOperands()) { + Type operandElementTy = + operand.getType().template cast().getElementType(); + Type flatTy = + RankedTensorType::get({ShapedType::kDynamicSize}, operandElementTy); + Value flat = rewriter.create(loc, flatTy, operand, + flatShape); + flatOperands.push_back(flat); + } + + // Apply operation to flattened operands. + Type resultElementTy = + op.getType().template cast().getElementType(); + Type flatResultTy = + RankedTensorType::get({ShapedType::kDynamicSize}, resultElementTy); + Value flatResult = + rewriter.create(loc, flatResultTy, flatOperands, op.getAttrs()); // Restore original shape. - auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rewriter.getIndexType()); - Value shapeAsExtentTensor = - rewriter.create(loc, extentTensorTy, shape); - Value result = rewriter.create( - loc, op.getType(), flatResult, shapeAsExtentTensor); - rewriter.replaceOp(op, result); + rewriter.replaceOpWithNewOp(op, op.getType(), + flatResult, shape); return success(); } @@ -162,24 +127,33 @@ struct BinaryElementwiseOpConversion : public OpRewritePattern { struct TransformUnrankedHloPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnFunction() override { // Setup conversion target. 
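The rewrite above relies on element-wise ops ignoring shape entirely: flattening every operand to one dynamically sized 1-D tensor of num_elements(shape) elements, applying the op, and reshaping back to the original extents yields the same result. A minimal sketch of that reasoning in plain C++ (illustrative only; ElementwiseViaFlatten is a hypothetical stand-in, no MLIR types involved):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Element-wise application over flattened data. The original shape only
// matters for computing the total element count and is restored untouched;
// lhs and rhs are assumed to hold at least that many elements.
std::vector<float> ElementwiseViaFlatten(
    const std::vector<float> &lhs, const std::vector<float> &rhs,
    const std::vector<std::size_t> &shape) {
  std::size_t num_elements = std::accumulate(
      shape.begin(), shape.end(), std::size_t{1}, std::multiplies<>());
  std::vector<float> flat_result(num_elements);
  for (std::size_t i = 0; i < num_elements; ++i)
    flat_result[i] = lhs[i] + rhs[i];  // stand-in for any element-wise op
  // "Restoring" the shape is a metadata change only; the data is unchanged.
  return flat_result;
}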
MLIRContext &ctx = getContext(); ConversionTarget target(ctx); - target.addLegalDialect(); target.addLegalOp(); -#define ADD_LEGAL(op) AddLegalOpOnRankedTensor(&target) - MAP_XLA_OPERATION_CWISE_UNARY(ADD_LEGAL, ;); - MAP_XLA_OPERATION_CWISE_BINARY(ADD_LEGAL, ;); -#undef ADD_LEGAL +#define ADD_LEGAL_MHLO(op) AddLegalOpOnRankedTensor(&target) +#define ADD_LEGAL_CHLO(op) AddLegalOpOnRankedTensor(&target) + MAP_XLA_OPERATION_CWISE_UNARY(ADD_LEGAL_MHLO, ;); + MAP_XLA_OPERATION_CWISE_BINARY(ADD_LEGAL_MHLO, ;); + MAP_CHLO_OPERATION_CWISE_UNARY(ADD_LEGAL_CHLO, ;); +#undef ADD_LEGAL_MHLO +#undef ADD_LEGAL_CHLO + AddLegalOpOnRankedTensor(&target); + AddLegalOpOnRankedTensor(&target); // Populate rewrite patterns. OwningRewritePatternList patterns; PopulateTransformUnrankedHloPatterns(&ctx, &patterns); // Apply transformation. - if (failed(applyFullConversion(getFunction(), target, patterns))) + if (failed(applyPartialConversion(getFunction(), target, patterns))) return signalPassFailure(); } }; @@ -188,24 +162,26 @@ struct TransformUnrankedHloPass void PopulateTransformUnrankedHloPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - // TODO(frgossen): Populate all unary and binary operations. - // clang-format off -#define MAP_UNARY(op) UnaryElementwiseOpConversion -#define MAP_BINARY(op) BinaryElementwiseOpConversion +#define MAP_UNARY(op) ElementwiseOpConversion +#define MAP_BINARY(op) ElementwiseOpConversion +#define MAP_CHLO_UNARY(op) ElementwiseOpConversion #define COMMA , + // clang-format off patterns->insert< MAP_XLA_OPERATION_CWISE_UNARY(MAP_UNARY, COMMA), - MAP_XLA_OPERATION_CWISE_BINARY(MAP_BINARY, COMMA) - >(context); + MAP_XLA_OPERATION_CWISE_BINARY(MAP_BINARY, COMMA), + MAP_CHLO_OPERATION_CWISE_UNARY(MAP_CHLO_UNARY, COMMA), + ElementwiseOpConversion, + ElementwiseOpConversion>(context); + // clang-format on #undef MAP_UNARY #undef MAP_BINARY +#undef MAP_CHLO_UNARY #undef COMMA - // clang-format on } -std::unique_ptr<::mlir::Pass> createTransformUnrankedHloPass() { +std::unique_ptr createTransformUnrankedHloPass() { return std::make_unique(); } -} // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc index 1458e5f3d63..9d072488389 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc @@ -122,7 +122,7 @@ class UnfuseBatchNormInferencePattern if (!fp_type) { return failure(); } - int64_t feature_dim = bn_op.feature_index().getSExtValue(); + int64_t feature_dim = bn_op.feature_index(); // Add epsilon to the variance and sqrt to get stddev: // stddev = sqrt(variance + epsilon) diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt new file mode 100644 index 00000000000..17e86f1caa8 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/utils/CMakeLists.txt @@ -0,0 +1,25 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +add_mlir_library(MLIRMhloUtils + broadcast_utils.cc + convert_op_folder.cc + cycle_detector.cc + hlo_utils.cc + + LINK_LIBS PUBLIC + MLIRSupport + ) diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc index a3ce4d44436..71b1a4e164f 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/broadcast_utils.cc @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/StandardTypes.h" @@ -46,9 +47,9 @@ bool IsLegalNumpyRankedBroadcast(Value lhs, Value rhs, broadcast_dims.getIntValues().begin()); } -Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, - Value rhs, - OpBuilder& builder) { +Value ComputeBinaryElementwiseBroadcastingResultExtents( + Location loc, Value lhs, Value rhs, OpBuilder& builder, + bool unsafe_as_extent_tensor) { auto lhs_type = lhs.getType().dyn_cast(); auto rhs_type = rhs.getType().dyn_cast(); if (!lhs_type || !rhs_type) { @@ -57,15 +58,22 @@ Value ComputeBinaryElementwiseBroadcastingResultExtents(Location loc, Value lhs, return nullptr; } - int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); Value lhs_shape_v = builder.createOrFold(loc, lhs); Value rhs_shape_v = builder.createOrFold(loc, rhs); - Value result_shape_v = builder.createOrFold( - loc, shape::ShapeType::get(builder.getContext()), lhs_shape_v, - rhs_shape_v, nullptr /* error */); - return builder.createOrFold( - loc, RankedTensorType::get({result_rank}, builder.getIndexType()), - result_shape_v); + + if (unsafe_as_extent_tensor) { + int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank()); + Value result_shape_v = builder.createOrFold( + loc, shape::getExtentTensorType(builder.getContext()), lhs_shape_v, + rhs_shape_v, nullptr /* error */); + return builder.createOrFold( + loc, RankedTensorType::get({result_rank}, builder.getIndexType()), + result_shape_v); + } + + return builder.createOrFold( + loc, builder.getType(), lhs_shape_v, rhs_shape_v, + nullptr /* error */); } } // namespace hlo diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc index df2442cc4b6..0bbd91e0680 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc @@ -60,10 +60,76 @@ DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value) { if (auto float_ty = ty.dyn_cast()) { APFloat value(float_ty.getFloatSemantics(), raw_value); return DenseElementsAttr::get(scalar_ty, value); + } else if (auto int_ty = ty.dyn_cast()) { + APInt value(int_ty.getWidth(), static_cast(raw_value), true); + return DenseElementsAttr::get(scalar_ty, value); + } else if (auto complex_ty = ty.dyn_cast()) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + 
scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } } - auto int_ty = ty.cast(); - APInt value(int_ty.getWidth(), static_cast(raw_value), true); - return DenseElementsAttr::get(scalar_ty, value); + llvm_unreachable("unsupported type"); +} + +static APFloat GetScalarLimitOfFloatType(FloatType float_ty, + ScalarLimit limit) { + auto &semantics = float_ty.getFloatSemantics(); + switch (limit) { + case kLowest: + return APFloat::getLargest(semantics, /*negative=*/true); + case kInfinityLowest: + return APFloat::getInf(semantics, /*negative=*/true); + case kMax: + return APFloat::getLargest(semantics, /*negative=*/false); + case kInfinityMax: + return APFloat::getInf(semantics, /*negative=*/false); + } + llvm_unreachable("invalid limit"); +} + +// Returns a scalar value for the given integer type. +// +// The argument 'scalar' describes which scalar value to return. `integer_value` +// is used to specify the integer value for kInteger. For any other scalar, +// integer_value is ignored. +static APInt GetScalarLimitOfIntegerType(IntegerType integer_ty, + ScalarLimit limit) { + unsigned width = integer_ty.getWidth(); + switch (limit) { + case kLowest: + case kInfinityLowest: + if (integer_ty.isUnsigned()) { + return APInt::getMinValue(width); + } else { + return APInt::getSignedMinValue(width); + } + + case kMax: + case kInfinityMax: + if (integer_ty.isUnsigned()) { + return APInt::getMaxValue(width); + } else { + return APInt::getSignedMaxValue(width); + } + } + llvm_unreachable("invalid limit"); +} + +DenseElementsAttr GetScalarLimitOfType(Type ty, ScalarLimit limit) { + RankedTensorType scalar_ty = RankedTensorType::get({}, ty); + if (auto float_ty = ty.dyn_cast()) { + return DenseElementsAttr::get(scalar_ty, + GetScalarLimitOfFloatType(float_ty, limit)); + } else if (auto integer_ty = ty.dyn_cast()) { + return DenseElementsAttr::get( + scalar_ty, GetScalarLimitOfIntegerType(integer_ty, limit)); + } + llvm_unreachable("unsupported type"); } } // namespace hlo diff --git a/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt new file mode 100644 index 00000000000..36a7eec5a1f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/CMakeLists.txt @@ -0,0 +1,36 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
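For reference, the scalar limits introduced in GetScalarLimitOfType above reduce to the familiar numeric bounds. A hedged example of the values one would expect for a couple of common element types, written against the standard library rather than the APInt/APFloat machinery the patch actually uses:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

int main() {
  // kMax / kLowest on a 32-bit signed integer correspond to the usual
  // two's-complement bounds; integers have no infinity, so kInfinityMax and
  // kInfinityLowest collapse to the same values.
  assert(std::numeric_limits<std::int32_t>::max() == 2147483647);
  assert(std::numeric_limits<std::int32_t>::lowest() == -2147483648LL);

  // For f32, kMax / kLowest are the largest finite magnitudes, while
  // kInfinityMax / kInfinityLowest are the signed infinities.
  float max = std::numeric_limits<float>::max();
  float lowest = std::numeric_limits<float>::lowest();
  assert(lowest == -max);
  assert(std::isinf(std::numeric_limits<float>::infinity()));
  return 0;
}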
+# +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py +) + +set(MLIR_HLO_TEST_DEPENDS + FileCheck count not + mlir-hlo-opt +) + +add_lit_testsuite(check-mlir-hlo-lit "Running the mlir-hlo regression tests" + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${MLIR_HLO_TEST_DEPENDS} + ) +set_target_properties(check-mlir-hlo-lit PROPERTIES FOLDER "Tests") + +add_lit_testsuites(MLIR_HLO_OPT ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${MLIR_HLO_TEST_DEPENDS}) + +add_dependencies(check-mlir-hlo check-mlir-hlo-lit) diff --git a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir index f0fe52266f0..5da43d5f113 100644 --- a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir @@ -191,6 +191,20 @@ func @concatenate_const_2D_horizontal() -> tensor<2x2xi32> { return %2 : tensor<2x2xi32> } +// CHECK-LABEL: constant_like_constant +func @constant_like_constant(%arg0: tensor<3x4xi32>) -> tensor<3x4xf32> { + // CHECK: mhlo.constant dense<3.200000e+00> + %0 = "chlo.constant_like"(%arg0) { value = 3.2 : f32 } : (tensor<3x4xi32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// CHECK-LABEL: constant_like_constant_dynamic +func @constant_like_constant_dynamic(%arg0: tensor<*xi32>) -> tensor<*xf32> { + // CHECK: chlo.constant_like + %0 = "chlo.constant_like"(%arg0) { value = 3.2 : f32 } : (tensor<*xi32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: dynamic_slice_variable_start func @dynamic_slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<1x4xi32> { // CHECK: "mhlo.dynamic-slice" @@ -287,6 +301,13 @@ func @slice_2D_fold_vertical() -> tensor<4x1xi64> { return %1 : tensor<4x1xi64> } +// CHECK-LABEL: slice_unknown_shape +func @slice_unknown_shape(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<*xf32>) -> tensor<*xf32> + %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: slice_concat_fold_first func @slice_concat_fold_first(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5xf32>) -> tensor<1x5xf32> { %0 = "mhlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<2x5xf32> @@ -561,3 +582,298 @@ func @dce_while_without_side_effect(%arg0: tensor) -> tensor { return %arg0 : tensor } + +// CHECK-LABEL: unpack_repack_same_tuple +// CHECK-SAME: ([[ARG0:%.*]]: tuple, !mhlo.token, tensor>) +func @unpack_repack_same_tuple(%arg0: tuple, !mhlo.token, tensor>) -> tuple, !mhlo.token, tensor> { + %0 = "mhlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, !mhlo.token, tensor>) -> tensor + %1 = "mhlo.get_tuple_element"(%arg0) {index = 1 : i32} : (tuple, !mhlo.token, tensor>) -> !mhlo.token + %2 = "mhlo.get_tuple_element"(%arg0) {index = 2 : i32} : (tuple, !mhlo.token, tensor>) -> tensor + %3 = "mhlo.tuple"(%0, %1, %2) : (tensor, !mhlo.token, tensor) -> tuple, !mhlo.token, tensor> + + // CHECK: return [[ARG0]] + return %3 : tuple, !mhlo.token, tensor> +} + +// CHECK-LABEL: unpack_repack_same_tuple_single_element +// CHECK-SAME: ([[ARG0:%.*]]: tuple>) +func 
@unpack_repack_same_tuple_single_element(%arg0: tuple>) -> tuple> { + %0 = "mhlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple>) -> tensor + %3 = "mhlo.tuple"(%0) : (tensor) -> tuple> + + // CHECK: return [[ARG0]] + return %3 : tuple> +} + +// CHECK-LABEL: func @erase_dead_lhlo_constant +func @erase_dead_lhlo_constant() { + %M = alloc() : memref<256x1024xf32> + // CHECK-NEXT: return + "lmhlo.constant"(%M) {value = dense<0.0> : tensor} : (memref<256x1024xf32>) -> () + dealloc %M : memref<256x1024xf32> + return +} + +// A negative test for dead lhlo constant op erasure. +// CHECK-LABEL: func @erase_dead_lhlo_constant_negative +func @erase_dead_lhlo_constant_negative(%M : memref<4xf32>) -> memref<256x1024xf32> { + // CHECK-NEXT: lmhlo.constant + "lmhlo.constant"(%M) {value = dense<0.0> : tensor} : (memref<4xf32>) -> () + // CHECK-NEXT: alloc + // CHECK-NEXT: lmhlo.constant + %N = alloc() : memref<256x1024xf32> + "lmhlo.constant"(%N) {value = dense<0.0> : tensor} : (memref<256x1024xf32>) -> () + return %N : memref<256x1024xf32> +} + +// CHECK-LABEL: func @fold_get_dimension_size +func @fold_get_dimension_size(%I : tensor<1x128x512xf32>) -> tensor { + %size = "mhlo.get_dimension_size"(%I) {dimension = 2 : i32} : (tensor<1x128x512xf32>) -> tensor + return %size : tensor + // CHECK-NEXT: %[[C:.*]] = mhlo.constant dense<512> : tensor + // CHECK-NEXT: return %[[C]] +} + +// CHECK-LABEL: func @fold_select_same +func @fold_select_same(%arg0 : tensor, %arg1 : tensor) -> tensor { + %1 = "mhlo.select"(%arg1, %arg0, %arg0) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg0 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_first +func @fold_select_first(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg0 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_second +func @fold_select_second(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg1 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_vector +func @fold_select_vector(%arg0 : tensor<4xf32>, %arg1 : tensor<4xf32>) -> tensor<4xf32> { + %0 = mhlo.constant dense<1> : tensor<4xi1> + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: return %arg0 + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: gather_to_slice +func @gather_to_slice(%arg0: tensor<5x6x7xf32>) -> tensor<3x6x5xf32> { + %0 = constant dense<[1, 2]> : tensor<2xi32> + %1 = "mhlo.gather"(%arg0, %0) { + dimension_numbers = {collapsed_slice_dims = dense<> : tensor<0xi64>, + index_vector_dim = 0 : i64, + offset_dims = dense<[0, 1, 2]> : tensor<3xi64>, + start_index_map = dense<[0, 2]> : tensor<2xi64>}, + indices_are_sorted = false, + slice_sizes = dense<[3, 6, 5]> : tensor<3xi64>} : (tensor<5x6x7xf32>, tensor<2xi32>) -> tensor<3x6x5xf32> + return %1 : tensor<3x6x5xf32> + // CHECK: %[[RET:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[4, 6, 7]> : tensor<3xi64>, start_indices = dense<[1, 0, 2]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<5x6x7xf32>) -> tensor<3x6x5xf32> + // CHECK: return %[[RET]] : tensor<3x6x5xf32> +} + +// CHECK-LABEL: gather_scalar_index_to_slice +func @gather_scalar_index_to_slice(%arg0: tensor<5x6x7xf32>) -> tensor<5x6x4xf32> { + %0 = constant dense<1> : tensor + %1 = "mhlo.gather"(%arg0, 
%0) { + dimension_numbers = {collapsed_slice_dims = dense<> : tensor<0xi64>, + index_vector_dim = 0 : i64, + offset_dims = dense<[0, 1, 2]> : tensor<3xi64>, + start_index_map = dense<[2]> : tensor<1xi64>}, + indices_are_sorted = false, + slice_sizes = dense<[5, 6, 4]> : tensor<3xi64>} : (tensor<5x6x7xf32>, tensor) -> tensor<5x6x4xf32> + return %1 : tensor<5x6x4xf32> + // CHECK: %[[RET:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[5, 6, 5]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<5x6x7xf32>) -> tensor<5x6x4xf32> + // CHECK: return %[[RET]] : tensor<5x6x4xf32> +} + +// CHECK-LABEL: func @fold_and_same +func @fold_and_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.and"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_ones +func @fold_and_ones(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_zeros +func @fold_and_zeros(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_constant +func @fold_and_constant(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<7> : tensor<4xi32> + // CHECK: mhlo.and + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_constants +func @fold_and_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.and"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[0, 1, 6, 2]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_same +func @fold_or_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.or"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_ones +func @fold_or_ones(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + %1 = "mhlo.or"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_zeros +func @fold_or_zeros(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.or"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_constant +func @fold_or_constant(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<7> : tensor<4xi32> + // CHECK: mhlo.or + %1 = "mhlo.or"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_zeros_right +func @fold_or_zeros_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.or"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func 
@fold_or_zeros_constants +func @fold_or_zeros_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.or"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[7, 3, 7, 3]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_same +func @fold_xor_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.xor"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<0> : tensor<4xi32> + // CHECK: return %0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_ones_left +func @fold_xor_ones_left(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + // CHECK: mhlo.xor + %1 = "mhlo.xor"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_ones_right +func @fold_xor_ones_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + // CHECK: mhlo.xor + %1 = "mhlo.xor"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_left +func @fold_xor_zeros_left(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.xor"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_right +func @fold_xor_zeros_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.xor"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_constants +func @fold_xor_zeros_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.xor"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[7, 2, 1, 1]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_negate_int +func @fold_negate_int() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, -3]> : tensor<4xi32> + // CHECK: mhlo.constant dense<[0, -1, -6, 3]> + %1 = "mhlo.negate"(%0) : (tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_negate_float +func @fold_negate_float() -> tensor<4xf32> { + %0 = mhlo.constant dense<[0., 1., 6., -3.]> : tensor<4xf32> + // CHECK: mhlo.constant dense<[-0.000000e+00, -1.000000e+00, -6.000000e+00, 3.000000e+00]> + %1 = "mhlo.negate"(%0) : (tensor<4xf32>) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @fold_sqrt_f32_constants +func @fold_sqrt_f32_constants() -> tensor<4xf32> { + %0 = mhlo.constant dense<1.0> : tensor<4xf32> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: mhlo.constant dense<1.000000e+00> : tensor<4xf32> + // CHECK-NOT: mhlo.sqrt + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @fold_sqrt_f64_constants +func @fold_sqrt_f64_constants() -> tensor<4xf64> { + %0 = mhlo.constant dense<[1.0, 4.0, 9.0, 16.0]> : tensor<4xf64> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf64>) -> tensor<4xf64> + // CHECK: mhlo.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf64> + // 
CHECK-NOT: mhlo.sqrt + return %1 : tensor<4xf64> +} + +// CHECK-LABEL: func @not_fold_sqrt_neg_constants +func @not_fold_sqrt_neg_constants() -> tensor<4xf32> { + %0 = mhlo.constant dense<-1.0> : tensor<4xf32> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: mhlo.constant dense<-1.000000e+00> : tensor<4xf32> + // CHECK: mhlo.sqrt + return %1 : tensor<4xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir index 99aab532688..0738459f8b6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir @@ -1,19 +1,18 @@ -// RUN: mlir-hlo-opt -mhlo-test-infer-shaped-type-methods -allow-unregistered-dialect -split-input-file -verify-diagnostics %s -o - | FileCheck %s +// RUN: mlir-hlo-opt --mhlo-test-infer-shaped-type-methods --allow-unregistered-dialect --split-input-file %s | FileCheck %s // CHECK-LABEL: @broadcast_add // Note that all broadcast_ops are expanded from the same template, so // only test reification on an examplar op. // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: tensor -func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { +func @broadcast_add(%arg0: tensor, %arg1: tensor) -> !shape.shape { // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] - // CHECK-DAG: %[[BCAST_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] - // CHECK: %[[EXTENTS:.+]] = shape.to_extent_tensor %[[BCAST_S]] - // CHECK: return %[[EXTENTS]] + // CHECK-DAG: %[[BCAST_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] : tensor, tensor -> !shape.shape + // CHECK: return %[[BCAST_S]] : !shape.shape %0 = chlo.broadcast_add %arg0, %arg1 : (tensor, tensor) -> tensor - %1 = "mhlo_test.reify_return_type_shapes"(%0) : (tensor) -> tensor<1xindex> - return %1 : tensor<1xindex> + %1 = "mhlo_test.reify_return_type_shapes"(%0) : (tensor) -> !shape.shape + return %1 : !shape.shape } // ----- diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir index c08ead5081e..af19a9b5c1c 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -19,7 +19,7 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} // CHECK-NEXT: %[[RESULT:.+]] = mhlo.add %[[ARG0_B]], %[[ARG1_B]] @@ -40,7 +40,7 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] // CHECK-NEXT: %[[RESULT_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] - // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_S]] : tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : 
(tensor, tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: %[[RESULT:.+]] = "mhlo.complex"(%[[ARG0_B]], %[[ARG1_B]]) : (tensor, tensor) -> tensor> @@ -61,7 +61,7 @@ func @dynamicBroadcastCompare(%arg0: tensor, %arg1: tensor) -> t // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]] // CHECK: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]] // CHECK: %[[RESULT_S:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]] - // CHECK: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_S]] + // CHECK: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_S]] : tensor to tensor<2xindex> // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK: %[[RESULT:.+]] = "mhlo.compare"(%[[ARG0_B]], %[[ARG1_B]]) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor @@ -253,7 +253,7 @@ func @addScalarUnranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf3 // to a 1D tensor. // CHECK: %[[SHAPE_1:.*]] = shape.shape_of %[[ARG_1]] : tensor<*xf32> // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_1]] : tensor -> index -// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_1]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // The assuming region is part of the second stage of lowering // with ranked broadcasting logic. @@ -263,7 +263,7 @@ func @addScalarUnranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf3 // CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[WITNESS]] -> (tensor) { // CHECK: %[[SCALAR_SHAPE:.*]] = shape.const_shape [] // CHECK: %[[BROADCASTED_SHAPE:.*]] = shape.broadcast %[[SCALAR_SHAPE]], %[[SHAPE_RESHAPED]] -// CHECK: %[[SHAPE_TENSOR:.*]] = shape.to_extent_tensor %[[BROADCASTED_SHAPE]] : !shape.shape -> tensor<1xindex> +// CHECK: %[[SHAPE_TENSOR:.*]] = tensor_cast %[[BROADCASTED_SHAPE]] : tensor to tensor<1xindex> // CHECK: %[[BROADCASTED_LHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG_0]], %[[SHAPE_TENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK: %[[BROADCASTED_RHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[RESHAPED]], %[[SHAPE_TENSOR]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK: %[[BROADCASTED_RESULT:.*]] = mhlo.add %[[BROADCASTED_LHS]], %[[BROADCASTED_RHS]] : tensor @@ -288,7 +288,7 @@ func @addUnrankedScalar(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf3 // to a 1D tensor. 
// CHECK: %[[SHAPE_0:.*]] = shape.shape_of %[[ARG_0]] : tensor<*xf32> // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_0]] : tensor -> index -// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_0]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // The assuming region is part of the second stage of lowering // with ranked broadcasting logic. @@ -296,7 +296,7 @@ func @addUnrankedScalar(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf3 // CHECK: %[[SHAPE_1:.*]] = shape.shape_of %[[ARG_1]] : tensor // CHECK: %[[WITNESS:.*]] = shape.cstr_broadcastable %[[SHAPE_RESHAPED]], %[[SHAPE_1]] // CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[WITNESS]] -> (tensor) { -// CHECK: %[[ASTENSOR:.*]] = shape.to_extent_tensor %[[SHAPE_RESHAPED]] +// CHECK: %[[ASTENSOR:.*]] = tensor_cast %[[SHAPE_RESHAPED]] // CHECK: %[[BROADCASTED_LHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[RESHAPED]], %[[ASTENSOR]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK: %[[BROADCASTED_RHS:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG_1]], %[[ASTENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK: %[[BROADCASTED_RESULT:.*]] = mhlo.add %[[BROADCASTED_LHS]], %[[BROADCASTED_RHS]] : tensor @@ -353,10 +353,12 @@ func @addUnrankedUnranked( // Handle rank 2 specialization // CHECK: %[[VAL_26:.*]] = scf.if %[[GREATEST_RANK_IS_2]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_2:.*]] = shape.const_shape [1, 1] -// CHECK: %[[BROADCASTED_LHS_2:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> -// CHECK: %[[BROADCASTED_RHS_2:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> -// CHECK: %[[RESHAPED_LHS_2:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_2:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_2:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor +// CHECK: %[[CASTED_LHS_2:.*]] = tensor_cast %[[BROADCASTED_LHS_2]] : tensor to tensor<2xindex> +// CHECK: %[[BROADCASTED_RHS_2:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor +// CHECK: %[[CASTED_RHS_2:.*]] = tensor_cast %[[BROADCASTED_RHS_2]] : tensor to tensor<2xindex> +// CHECK: %[[RESHAPED_LHS_2:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_2:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor // CHECK: %[[RESULT_RANK_2:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_2]], %[[RESHAPED_RHS_2]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_2:.*]] = tensor_cast %[[RESULT_RANK_2]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_2]] : tensor<*xf32> @@ -366,10 +368,12 @@ func @addUnrankedUnranked( // Handle rank 3 specialization // CHECK: %[[VAL_34:.*]] = scf.if %[[GREATEST_RANK_IS_3]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_3:.*]] = shape.const_shape [1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_3:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, 
tensor<3xindex> -> tensor<3xindex> -// CHECK: %[[BROADCASTED_RHS_3:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor<3xindex> -// CHECK: %[[RESHAPED_LHS_3:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_3:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_3:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor +// CHECK: %[[CASTED_LHS_3:.*]] = tensor_cast %[[BROADCASTED_LHS_3]] : tensor to tensor<3xindex> +// CHECK: %[[BROADCASTED_RHS_3:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor +// CHECK: %[[CASTED_RHS_3:.*]] = tensor_cast %[[BROADCASTED_RHS_3]] : tensor to tensor<3xindex> +// CHECK: %[[RESHAPED_LHS_3:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_3:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor // CHECK: %[[RESULT_RANK_3:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_3]], %[[RESHAPED_RHS_3]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_3:.*]] = tensor_cast %[[RESULT_RANK_3]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_3]] : tensor<*xf32> @@ -379,10 +383,12 @@ func @addUnrankedUnranked( // Handle rank 4 specialization // CHECK: %[[VAL_42:.*]] = scf.if %[[GREATEST_RANK_IS_4]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_4:.*]] = shape.const_shape [1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_4:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> -// CHECK: %[[BROADCASTED_RHS_4:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> -// CHECK: %[[RESHAPED_LHS_4:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_4:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_4:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor +// CHECK: %[[CASTED_LHS_4:.*]] = tensor_cast %[[BROADCASTED_LHS_4]] : tensor to tensor<4xindex> +// CHECK: %[[BROADCASTED_RHS_4:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor +// CHECK: %[[CASTED_RHS_4:.*]] = tensor_cast %[[BROADCASTED_RHS_4]] : tensor to tensor<4xindex> +// CHECK: %[[RESHAPED_LHS_4:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_4:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor // CHECK: %[[RESULT_RANK_4:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_4]], %[[RESHAPED_RHS_4]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_4:.*]] = tensor_cast %[[RESULT_RANK_4]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_4]] : tensor<*xf32> @@ -392,10 +398,12 @@ func @addUnrankedUnranked( // Handle rank 5 specialization // CHECK: %[[VAL_50:.*]] = scf.if %[[GREATEST_RANK_IS_5]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_5:.*]] = shape.const_shape [1, 1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_5:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> -// CHECK: 
%[[BROADCASTED_RHS_5:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> -// CHECK: %[[RESHAPED_LHS_5:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_5:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_5:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor +// CHECK: %[[CASTED_LHS_5:.*]] = tensor_cast %[[BROADCASTED_LHS_5]] : tensor to tensor<5xindex> +// CHECK: %[[BROADCASTED_RHS_5:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor +// CHECK: %[[CASTED_RHS_5:.*]] = tensor_cast %[[BROADCASTED_RHS_5]] : tensor to tensor<5xindex> +// CHECK: %[[RESHAPED_LHS_5:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_5:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor // CHECK: %[[RESULT_RANK_5:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_5]], %[[RESHAPED_RHS_5]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_5:.*]] = tensor_cast %[[RESULT_RANK_5]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_5]] : tensor<*xf32> @@ -405,10 +413,12 @@ func @addUnrankedUnranked( // Handle rank 6 specialization // CHECK: %[[VAL_58:.*]] = scf.if %[[GREATEST_RANK_IS_6]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_6:.*]] = shape.const_shape [1, 1, 1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_6:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> -// CHECK: %[[BROADCASTED_RHS_6:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> -// CHECK: %[[RESHAPED_LHS_6:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_6:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_6:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor +// CHECK: %[[CASTED_LHS_6:.*]] = tensor_cast %[[BROADCASTED_LHS_6]] : tensor to tensor<6xindex> +// CHECK: %[[BROADCASTED_RHS_6:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor +// CHECK: %[[CASTED_RHS_6:.*]] = tensor_cast %[[BROADCASTED_RHS_6]] : tensor to tensor<6xindex> +// CHECK: %[[RESHAPED_LHS_6:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_6:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor // CHECK: %[[RESULT_RANK_6:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_6]], %[[RESHAPED_RHS_6]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_6:.*]] = tensor_cast %[[RESULT_RANK_6]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_6]] : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir new file mode 100644 index 00000000000..371e730c30b --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-hlo-opt --mhlo-test-chlo-legalize-to-hlo --split-input-file %s | FileCheck %s + +// Lower statically shaped `constant_like` to 
constant. +// CHECK-LABEL: @constant_like_static_shape +func @constant_like_static_shape(%arg : tensor<1x2xi64>) -> tensor<1x2xf32> { + // CHECK: %[[RESULT:.*]] = mhlo.constant dense<3.200000e+00> : tensor<1x2xf32> + // CHECK: return %[[RESULT]] + %result = "chlo.constant_like"(%arg) { value = 3.2 : f32 } + : (tensor<1x2xi64>) -> tensor<1x2xf32> + return %result : tensor<1x2xf32> +} + +// Lower dynamically shaped `constant_like` to broadcasted constant. +// CHECK-LABEL: constant_like_dynamic_shape +// CHECK-SAME: (%[[ARG:.*]]: tensor) +func @constant_like_dynamic_shape(%arg : tensor) -> tensor { + // CHECK: %[[CONSTANT:.*]] = mhlo.constant dense<3.200000e+00> : tensor + // CHECK: %[[UNCASTED_SHAPE:.*]] = shape.shape_of %[[ARG]] : tensor -> tensor + // CHECK: %[[SHAPE:.*]] = tensor_cast %[[UNCASTED_SHAPE]] : tensor to tensor<2xindex> + // CHECK: %[[BROADCASTED_CONSTANT:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[CONSTANT]], %[[SHAPE]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK: return %[[BROADCASTED_CONSTANT]] : tensor + %result = "chlo.constant_like"(%arg) { value = 3.2 : f32 } + : (tensor) -> tensor + return %result : tensor +} + diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir index 018711e33cb..960a769c388 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir @@ -170,7 +170,7 @@ func @dyn_broadcast(%operand: memref) { // BOTH-SAME: (%[[OPERAND:.*]]: memref) %tensor_operand = tensor_load %operand : memref %c1 = constant 1 : i64 - %shape = tensor_from_elements(%c1, %c1, %c1) : tensor<3xi64> + %shape = tensor_from_elements %c1, %c1, %c1 : tensor<3xi64> %tensor_result = "mhlo.dynamic_broadcast_in_dim"(%tensor_operand, %shape) { broadcast_dimensions = dense<[1, 2]> : tensor<2xi64> } : (tensor, tensor<3xi64>) -> tensor @@ -320,6 +320,18 @@ func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// BOTH-LABEL: func @floor +func @floor(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "mhlo.floor"(%tensor_operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // BOTH: "lmhlo.floor"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + // BOTH-LABEL: func @neg func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -404,7 +416,7 @@ func @add_dyn(%lhs: tensor, %rhs: tensor) { // BOTH: %[[C1:.*]] = constant 1 : index // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[SHAPE:.*]] = tensor_from_elements %[[IC0]], %[[IC1]] : tensor<2xi64> // BOTH: %[[C0_:.*]] = constant 0 : index // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -429,7 +441,7 @@ func @tanh_dyn(%arg0: tensor) { // BOTH: %[[C1:.*]] = constant 1 : index // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[SHAPE:.*]] = tensor_from_elements %[[IC0]], %[[IC1]] : tensor<2xi64> // BOTH: %[[C0_:.*]] = constant 0 : 
index // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -510,3 +522,16 @@ func @reduce(%arg0: tensor<1x8xf32>, %arg1: tensor) -> tensor<1xf32> { : (tensor<1x8xf32>, tensor) -> tensor<1xf32> return %0 : tensor<1xf32> } + +// ----- + +// BOTH-LABEL: func @transpose +func @transpose(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "mhlo.transpose"(%tensor_operand) {permutation = dense<[1, 0]> : tensor<2xi64>} + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // BOTH: "lmhlo.transpose"(%{{.*}}, %{{.*}}) {permutation = dense<[1, 0]> : tensor<2xi64>} + // BOTH-NOT: tensor_store + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir index 46725e0bd09..263ea1b4040 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir @@ -152,6 +152,16 @@ func @float_ceil(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- +// CHECK-LABEL: func @floor +func @floor(%input: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: linalg.generic + // CHECK: floorf + %0 = "mhlo.floor"(%input) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + // CHECK-LABEL: func @float_neg func @float_neg(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: linalg.generic @@ -373,6 +383,40 @@ func @reshape_2D_4D(%arg0: tensor<12x42xi32>) -> tensor<12x1x42x1xi32> { // ----- +// CHECK-DAG: #[[RESHAPE_MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[RESHAPE_MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape_3D_4D +func @reshape_3D_4D(%arg0: tensor<1x49x16xf32>) -> tensor<1x784x1x1xf32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<1x49x16xf32>) -> tensor<1x784x1x1xf32> + return %0 : tensor<1x784x1x1xf32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[RESHAPE_MAP1]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[RESHAPE_MAP2]]] + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape1_4D_4D +func @reshape1_4D_4D(%arg0: tensor<4x512x1x1xi32>) -> tensor<1x4x1x512xi32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<4x512x1x1xi32>) -> tensor<1x4x1x512xi32> + return %0 : tensor<1x4x1x512xi32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape2_4D_4D +func @reshape2_4D_4D(%arg0: tensor<4x1x1x1024xi32>) -> tensor<4x1024x1x1xi32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<4x1x1x1024xi32>) -> tensor<4x1024x1x1xi32> + return %0 : tensor<4x1024x1x1xi32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] + +// ----- + // CHECK-LABEL: func @minf func @minf(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { %0 = "mhlo.minimum"(%lhs, %rhs) diff --git a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir similarity index 68% rename from tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir rename to tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir index 
6cc07e0460c..ae61fc8477e 100644 --- a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt -transform-unranked-hlo -split-input-file %s | FileCheck %s +// RUN: mlir-hlo-opt --transform-unranked-hlo --split-input-file %s | FileCheck %s // Check the validity of expected IR. // CHECK-LABEL: @sqr_transform_result @@ -7,8 +7,7 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { // Flatten operand shape. %shape = shape.shape_of %a : tensor<*xf32> -> tensor %num_elements = shape.num_elements %shape : tensor -> index - %num_elements_as_index = shape.size_to_index %num_elements : index - %flat_shape = tensor_from_elements(%num_elements_as_index) : tensor<1xindex> + %flat_shape = tensor_from_elements %num_elements : tensor<1xindex> %flat_a = "mhlo.dynamic_reshape"(%a, %flat_shape) : (tensor<*xf32>, tensor<1xindex>) -> tensor @@ -16,8 +15,7 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { %flat_b = "mhlo.sqrt"(%flat_a) : (tensor) -> tensor // Restore original shape. - %shape_as_extent_tensor = shape.to_extent_tensor %shape : tensor -> tensor - %b = "mhlo.dynamic_reshape"(%flat_b, %shape_as_extent_tensor) + %b = "mhlo.dynamic_reshape"(%flat_b, %shape) : (tensor, tensor) -> tensor<*xf32> return %b : tensor<*xf32> @@ -29,14 +27,12 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @sqrt // CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> + // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> -> tensor // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK-NEXT: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] - // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK-NEXT: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK-NEXT: %[[FLAT_B:.*]] = "mhlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor - // CHECK-NEXT: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = shape.to_extent_tensor %[[SHAPE]] : tensor - // CHECK-NEXT: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK-NEXT: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> // CHECK-NEXT: return %[[B]] : tensor<*xf32> %b = "mhlo.sqrt"(%a) : (tensor<*xf32>) -> tensor<*xf32> return %b : tensor<*xf32> @@ -73,16 +69,30 @@ func @sqrt_static(%a: tensor<2x3xf32>) -> tensor<2x3xf32> { func @add_unranked(%a : tensor<*xf32>, %b : tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SHAPE_A:.*]] = shape.shape_of %[[A]] // CHECK: %[[SHAPE_B:.*]] = shape.shape_of %[[B]] - // CHECK: %[[SHAPE:.*]] = "shape.any"(%[[SHAPE_A]], %[[SHAPE_B]]) + // CHECK: %[[SHAPE:.*]] = shape.any %[[SHAPE_A]], %[[SHAPE_B]] // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] - // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> + // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : 
(tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_B:.*]] = "mhlo.dynamic_reshape"(%[[B]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_RESULT:.*]] = mhlo.add %[[FLAT_A]], %[[FLAT_B]] : tensor - // CHECK: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = shape.to_extent_tensor %[[SHAPE]] - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_RESULT]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_RESULT]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> // CHECK: return %[[RESULT]] : tensor<*xf32> %result = mhlo.add %a, %b : tensor<*xf32> return %result : tensor<*xf32> } + +// ----- + +// CHECK-LABEL: @tan +// CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) -> tensor<*xf32> +func @tan(%a : tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> -> tensor + // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] + // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> + // CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor + // CHECK: %[[FLAT_B:.*]] = chlo.tan %[[FLAT_A]] : tensor + // CHECK: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK: return %[[B]] : tensor<*xf32> + %result = chlo.tan %a : tensor<*xf32> + return %result : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir index 37a61498fbf..abe4e872b73 100644 --- a/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir @@ -42,6 +42,15 @@ func @binary_ops_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32 return %4 : tensor<4xi32> } +// CHECK-LABEL: func @unary_ops_float +func @unary_ops_float(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK-NEXT: %0 = ceilf %arg0 : tensor<4xf32> + %0 = "mhlo.ceil"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> + + // CHECK-NEXT: return %0 : tensor<4xf32> + return %0 : tensor<4xf32> +} + // CHECK-LABEL: func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>) { func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg0 : tensor<4xi32> diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir new file mode 100644 index 00000000000..9c887a73a0f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir @@ -0,0 +1,38 @@ +// RUN: mlir-hlo-opt --mhlo-control-flow-to-scf %s | FileCheck %s + +func @lt_loop(%arg0: tensor<4xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor<4xf32>, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> (tuple, tensor, tensor>) { + %cst = constant dense<-1> : tensor + %cst_0 = constant dense<1> : tensor + %cst_1 = constant dense<0> : tensor + %cst_2 = constant dense<1000> : tensor + %0 = "mhlo.tuple"(%cst_1, %cst, %cst_2) : (tensor, tensor, tensor) -> tuple, tensor, tensor> + %1 = "mhlo.while"(%0) ( { + ^bb0(%arg9: tuple, tensor, tensor>): // no predecessors + %2 = "mhlo.get_tuple_element"(%arg9) {index = 0 : i32} : (tuple, tensor, tensor>) -> tensor + %3 = "mhlo.get_tuple_element"(%arg9) {index = 2 : i32} : (tuple, tensor, 
tensor>) -> tensor + %4 = "mhlo.compare"(%2, %3) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "mhlo.return"(%4) : (tensor) -> () + }, { + ^bb0(%arg9: tuple, tensor, tensor>): // no predecessors + %2 = "mhlo.get_tuple_element"(%arg9) {index = 0 : i32} : (tuple, tensor, tensor>) -> tensor + %3 = mhlo.add %2, %cst_0 : tensor + %4 = "mhlo.get_tuple_element"(%arg9) {index = 1 : i32} : (tuple, tensor, tensor>) -> tensor + %5 = "mhlo.get_tuple_element"(%arg9) {index = 2 : i32} : (tuple, tensor, tensor>) -> tensor + %6 = "mhlo.tuple"(%3, %4, %5) : (tensor, tensor, tensor) -> tuple, tensor, tensor> + "mhlo.return"(%6) : (tuple, tensor, tensor>) -> () + }) : (tuple, tensor, tensor>) -> tuple, tensor, tensor> + return %1 : tuple, tensor, tensor> +} + +// CHECK-LABEL: func @lt_loop( +// CHECK: %[[VAL_9:.*]] = constant dense<-1> : tensor +// CHECK: %[[VAL_10:.*]] = constant dense<1> : tensor +// CHECK: %[[VAL_11:.*]] = constant dense<0> : tensor +// CHECK: %[[VAL_12:.*]] = constant dense<1000> : tensor +// CHECK: %[[VAL_14:.*]] = index_cast %[[VAL_11]] : tensor to tensor +// CHECK: %[[VAL_15:.*]] = extract_element %[[VAL_14]][] : tensor +// CHECK: %[[VAL_16:.*]] = index_cast %[[VAL_12]] : tensor to tensor +// CHECK: %[[VAL_17:.*]] = extract_element %[[VAL_16]][] : tensor +// CHECK: %[[VAL_18:.*]] = index_cast %[[VAL_10]] : tensor to tensor +// CHECK: %[[VAL_19:.*]] = extract_element %[[VAL_18]][] : tensor +// CHECK: scf.for %[[VAL_21:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_19]] iter_args(%[[VAL_22:.*]] = %[[VAL_9]], %[[VAL_23:.*]] = %[[VAL_12]]) diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir deleted file mode 100644 index 3271595900d..00000000000 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir +++ /dev/null @@ -1,115 +0,0 @@ -// RUN: mlir-hlo-opt -lhlo-copy-removal %s -o - | FileCheck %s - -// CHECK-LABEL: func @remove_simple -func @remove_simple(%arg0: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @remove_without_dealloc -func @remove_without_dealloc(%arg0: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @replace_dependency -func @replace_dependency(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @keep_copies -func @keep_copies(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { - // CHECK-NEXT: "lmhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @must_not_be_removed -func @must_not_be_removed(%arg0: 
memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - // CHECK-NEXT: %[[ALLOC:.*]] = alloc() {temp = true} : memref<2x2xf32> - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %[[ALLOC]]) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.copy"(%[[ALLOC]], %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @must_be_removed_first -func @must_be_removed_first(%arg0: memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @must_be_removed_second -func @must_be_removed_second(%arg0: memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @reduce -func @reduce(%arg0: memref<1x8xf32>, %arg1: memref, %arg2: memref<1xf32>) { - %0 = alloc() : memref<1xf32> - "lmhlo.reduce"(%arg0, %arg1, %0) ( { - // CHECK: ^bb0(%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, - // CHECK-SAME: %[[ARG2:.*]]: memref) - ^bb0(%arg3: memref, %arg4: memref, %arg5: memref): - %1 = alloc() : memref - // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) - "lmhlo.add"(%arg3, %arg4, %1) - : (memref, memref, memref) -> () - // CHECK-NOT; lmhlo.copy - "lmhlo.copy"(%1, %arg5) : (memref, memref) -> () - "lmhlo.terminator"() : () -> () - }) {dimensions = dense<1> : tensor<1xi64>} - : (memref<1x8xf32>, memref, memref<1xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<1xf32>, memref<1xf32>) -> () - return -} diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir index 768d8da22bd..3162f37f912 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir @@ -496,6 +496,18 @@ func @sin(%input: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @floor +func @floor(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "lmhlo.floor"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: 
^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = floorf %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + // CHECK-LABEL: func @negf func @negf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { "lmhlo.negate"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () @@ -688,6 +700,46 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { // ----- +// CHECK-DAG: #[[RESHAPE_MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[RESHAPE_MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape_3D_4D +func @reshape_3D_4D(%arg0: memref<1x49x16xf32>, %arg1: memref<1x784x1x1xf32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<1x49x16xf32>, memref<1x784x1x1xf32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[RESHAPE_MAP1]]] +// CHECK: linalg.reshape %{{.*}} [#[[RESHAPE_MAP2]]] +// CHECK: linalg.copy + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape1_4D_4D +func @reshape1_4D_4D(%arg0: memref<4x512x1x1xi32>, + %arg1: memref<1x4x1x512xi32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<4x512x1x1xi32>, memref<1x4x1x512xi32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape2_4D_4D +func @reshape2_4D_4D(%arg0: memref<4x1x1x1024xi32>, + %arg1: memref<4x1024x1x1xi32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<4x1x1x1024xi32>, memref<4x1024x1x1xi32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @reverse @@ -722,3 +774,16 @@ func @conv(%input: memref<3x5x5x3xf32>, %filter: memref<2x2x3x4xf32>, %output: m "lmhlo.copy"(%0, %output) : (memref<3x5x5x4xf32>, memref<3x5x5x4xf32>) -> () "lmhlo.terminator"() : () -> () } + +// ----- + +// CHECK-DAG: #[[TRANSPOSE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d1, d0)> +// CHECK-DAG: #[[TRANSPOSE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @transpose +func @transpose(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { + "lmhlo.transpose"(%arg0, %arg1) { + permutation = dense<[1, 0]> : tensor<2xi64> + } : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[TRANSPOSE_INPUT_MAP]], #[[TRANSPOSE_OUTPUT_MAP]]] diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir index 5bb1d475b24..45c383bd1d6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-llvm.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s --test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s +// RUN: mlir-hlo-opt %s -lower-affine -convert-scf-to-std -test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s // CHECK-LABEL: func @static_memref_cast func @static_memref_cast(%buf : memref<10x1x5xf32>) { diff --git a/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py b/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py new file mode 100644 index 00000000000..f81d47a76cd --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/lit.cfg.py @@ 
-0,0 +1,82 @@ +"""Lit configuration to drive test in this repo.""" +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- Python -*- +# pylint: disable=undefined-variable + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import lit.formats +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst +import lit.util + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'MLIR_HLO_OPT' + +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.mlir', '.mlir.py'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.mlir_hlo_obj_root, 'test') + +config.substitutions.append(('%PATH%', config.environment['PATH'])) +config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) + +llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) + +llvm_config.use_default_substitutions() + +# excludes: A list of directories to exclude from the testsuite. The 'Inputs' +# subdirectories contain auxiliary inputs for various tests in their parent +# directories. +config.excludes = [ + 'Inputs', 'Examples', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt' +] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.mlir_hlo_obj_root, 'test') +config.mlir_hlo_tools_dir = os.path.join(config.mlir_hlo_obj_root, 'tools') + +# Tweak the PATH to include the tools dir. +llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) + +tool_dirs = [ + os.path.join(config.mlir_hlo_tools_dir, 'mlir-hlo-opt'), + config.llvm_tools_dir, +] +tools = [ + 'mlir-hlo-opt', + 'mlir-cpu-runner', + ToolSubst( + '%mlir_runner_utils_dir', + config.mlir_runner_utils_dir, + unresolved='ignore'), +] + +llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in b/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in new file mode 100644 index 00000000000..1555d314df0 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/lit.site.cfg.py.in @@ -0,0 +1,63 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +@LIT_SITE_CFG_IN_HEADER@ + +import sys + +config.host_triple = "@LLVM_HOST_TRIPLE@" +config.target_triple = "@TARGET_TRIPLE@" +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" +config.llvm_shlib_dir = "@SHLIBDIR@" +config.llvm_shlib_ext = "@SHLIBEXT@" +config.llvm_exe_ext = "@EXEEXT@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.python_executable = "@PYTHON_EXECUTABLE@" +config.gold_executable = "@GOLD_EXECUTABLE@" +config.ld64_executable = "@LD64_EXECUTABLE@" +config.enable_shared = @ENABLE_SHARED@ +config.enable_assertions = @ENABLE_ASSERTIONS@ +config.targets_to_build = "@TARGETS_TO_BUILD@" +config.native_target = "@LLVM_NATIVE_ARCH@" +config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') +config.host_os = "@HOST_OS@" +config.host_cc = "@HOST_CC@" +config.host_cxx = "@HOST_CXX@" +# Note: ldflags can contain double-quoted paths, so must use single quotes here. +config.host_ldflags = '@HOST_LDFLAGS@' +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' +config.host_arch = "@HOST_ARCH@" +config.mlir_hlo_src_root = "@CMAKE_SOURCE_DIR@" +config.mlir_hlo_obj_root = "@CMAKE_BINARY_DIR@" +config.mlir_runner_utils_dir = os.path.join(config.llvm_obj_root, "lib") + +# Support substitution of the tools_dir with user parameters. This is +# used when we can't determine the tool dir at configuration time. +try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params +except KeyError: + e = sys.exc_info()[1] + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. 
+lit_config.load_config(config, "@CMAKE_SOURCE_DIR@/tests/lit.cfg.py") diff --git a/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir new file mode 100644 index 00000000000..d626f520824 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-hlo-opt --mhlo-test-infer-shaped-type-methods --allow-unregistered-dialect --split-input-file %s | FileCheck %s + +// ----- +// CHECK-LABEL: @select +// CHECK-SAME: (%[[PRED:.*]]: tensor<2x?xi1>, +func @select(%pred : tensor<2x?xi1>, %a : tensor<2x?xf32>, %b : tensor<2x?xf32>) + -> tensor<2xi64> { + // CHECK: %[[C2:.*]] = constant 2 : i64 + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: %[[DIM_AS_INDEX:.*]] = dim %[[PRED]], %[[C1]] : tensor<2x?xi1> + // CHECK: %[[DIM:.*]] = index_cast %[[DIM_AS_INDEX]] : index to i64 + // CHECK: %[[SHAPE:.*]] = tensor_from_elements %[[C2]], %[[DIM]] : tensor<2xi64> + // CHECK: return %[[SHAPE]] : tensor<2xi64> + %0 = "mhlo.select"(%pred, %a, %b) + : (tensor<2x?xi1>, tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xf32> + %1 = "mhlo_test.reify_return_type_shapes"(%0) + : (tensor<2x?xf32>) -> tensor<2xi64> + return %1 : tensor<2xi64> +} + +// ----- +// CHECK-LABEL: @compare +// CHECK-SAME: (%[[A:.*]]: tensor<2x?xf32>, +func @compare(%a : tensor<2x?xf32>, %b : tensor<2x?xf32>) -> tensor<2xi64> { + // CHECK: %[[C2:.*]] = constant 2 : i64 + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: %[[DIM_AS_INDEX:.*]] = dim %[[A]], %[[C1]] : tensor<2x?xf32> + // CHECK: %[[DIM:.*]] = index_cast %[[DIM_AS_INDEX]] : index to i64 + // CHECK: %[[SHAPE:.*]] = tensor_from_elements %[[C2]], %[[DIM]] : tensor<2xi64> + // CHECK: return %[[SHAPE]] : tensor<2xi64> + %0 = "mhlo.compare"(%a, %b) { comparison_direction = "NE" } + : (tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xi1> + %1 = "mhlo_test.reify_return_type_shapes"(%0) + : (tensor<2x?xi1>) -> tensor<2xi64> + return %1 : tensor<2xi64> +} + diff --git a/tensorflow/compiler/mlir/hlo/tests/ops.mlir b/tensorflow/compiler/mlir/hlo/tests/ops.mlir index 212e79432b1..0120a7a5652 100644 --- a/tensorflow/compiler/mlir/hlo/tests/ops.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/ops.mlir @@ -116,6 +116,30 @@ func @dynamic_broadcast_in_dim(%arg0: tensor, %shape: tensor<3xi64>) -> // ----- +// CHECK-LABEL: func @dynamic_broadcast_in_dim_unknown_dim +func @dynamic_broadcast_in_dim_unknown_dim(%arg0: tensor<32xf32>, %shape: tensor<3xi64>) -> tensor { + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<32xf32>, tensor<3xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @dynamic_broadcast_in_dim_ok_dim +func @dynamic_broadcast_in_dim_ok_dim(%arg0: tensor<1xf32>, %shape: tensor<3xi64>) -> tensor<7x8x9xf32> { + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<1xf32>, tensor<3xi64>) -> tensor<7x8x9xf32> + return %0 : tensor<7x8x9xf32> +} + +// ----- + +func @dynamic_broadcast_in_dim_shape_mismatch(%arg0: tensor<32xf32>, %shape: tensor<3xi64>) -> tensor<7x8x9xf32> { + // expected-error@+1 {{size of operand dimension 0 (32) is not compatible with size of result dimension 2 (9)}} + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %shape) {broadcast_dimensions = dense<[2]> : tensor<1xi64>} : (tensor<32xf32>, tensor<3xi64>) -> tensor<7x8x9xf32> + return %0 : tensor<7x8x9xf32> +} + +// ----- + func 
@broadcast_in_dim_bad_dimension_rank(%arg0: tensor<1x2xi32>) -> tensor<1x2x3xi32> { // expected-error@+1 {{broadcast_dimensions has rank 2 instead of rank 1}} %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[[1,1],[1,1]]> : tensor<2x2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x3xi32> @@ -456,7 +480,7 @@ func @map_non_scalar_computation_operand(%arg0: tensor<4x5xf32>, %arg1: tensor<4 // expected-error@+1 {{computation arguments must be 0-rank tensor, but got: arg #1 of type 'tensor<5xf32>'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor<5xf32>): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -468,7 +492,7 @@ func @map_mismatch_operand_and_computation_args(%arg0: tensor<4x5xf32>, %arg1: t // expected-error@+1 {{element type of operands and computation arguments must match, but got: 'f32' and 'i32'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -480,7 +504,7 @@ func @map_invalid_number_of_computation_output(%arg0: tensor<4x5xf32>, %arg1: te // expected-error@+1 {{computation must return single output, but got: 0}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor + %1 = mhlo.constant dense<2.0> : tensor "mhlo.return"() : () -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -492,7 +516,7 @@ func @main_non_scalar_computation_output(%arg0: tensor<4x5xf32>, %arg1: tensor<4 // expected-error@+1 {{computation must return 0-rank tensor, but got: 'tensor<5xf32>'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2.0> : tensor} : tensor<5xf32> + %1 = mhlo.constant dense<2.0> : tensor<5xf32> "mhlo.return"(%1) : (tensor<5xf32>) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -504,7 +528,7 @@ func @mismatch_computation_output_type(%arg0: tensor<4x5xf32>, %arg1: tensor<4x5 // expected-error@+1 {{element type of result and computation output must match, but got: 'f32' and 'i32'}} %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): - %1 = mhlo.constant {value = dense<2> : tensor} : tensor + %1 = mhlo.constant dense<2> : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x5xf32>, tensor<4x5xf32>) -> tensor<4x5xf32> return %0 : tensor<4x5xf32> @@ -576,6 +600,14 @@ func @recv_non_token_second_result(%token: !mhlo.token) -> tuple // ----- +// CHECK-LABEL: func @replica_id +func @replica_id() -> tensor { + %0 = "mhlo.replica_id"() : () -> tensor + return %0 : tensor +} + +// ----- + func @rng_uniform_invalid_type(%mu: tensor>, %sigma: tensor) -> tensor<2x3x5xf32> { %shape = mhlo.constant dense<[2, 3, 5]> : tensor<3xi64> // expected-error@+1 {{but got 'tensor>'}} @@ -730,6 +762,14 @@ func @dynamic_update_slice_invalid_start(%input: tensor<3x4xi64>, %update: tenso // ----- +func 
@dynamic_update_slice_mismatched_start(%input: tensor<11x3x4xi32>, %update: tensor<1x3x4xi32>, %start1: tensor, %start2: tensor, %start3: tensor) -> tensor<11x3x4xi32> { + // expected-error@+1 {{start indices must have same element type (encountered mismatch: 'i32' vs 'i64')}} + %0 = "mhlo.dynamic-update-slice"(%input, %update, %start1, %start2, %start3) : (tensor<11x3x4xi32>, tensor<1x3x4xi32>, tensor, tensor, tensor) -> tensor<11x3x4xi32> + return %0 : tensor<11x3x4xi32> +} + +// ----- + // CHECK-LABEL: func @transpose func @transpose(%arg0: tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> { %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> diff --git a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir index f903dbb7080..53ee94f8d1a 100644 --- a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir @@ -109,7 +109,7 @@ func @batchNormInference_dynamic_shape( // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], %[[C0]] : tensor - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements(%[[DIM]]) : tensor<1xindex> + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements %[[DIM]] : tensor<1xindex> // CHECK-DAG: %[[EPS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "mhlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor @@ -117,7 +117,7 @@ func @batchNormInference_dynamic_shape( // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], %[[C1]] : tensor // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], %[[C2]] : tensor // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], %[[C3]] : tensor - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = tensor_from_elements(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : tensor<4xindex> + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = tensor_from_elements %[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]] : tensor<4xindex> // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[SCALE_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt new file mode 100644 index 00000000000..0f3d1c85795 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tools/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +add_subdirectory(mlir-hlo-opt) diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt new file mode 100644 index 00000000000..69971f4c024 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt @@ -0,0 +1,34 @@ +# +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +set(LIBS + ${dialect_libs} + ${conversion_libs} + MLIROptLib + + MhloRegisterDialects + AllMhloPasses + ) +add_llvm_executable(mlir-hlo-opt mlir-hlo-opt.cpp + DEPENDS + MLIRLmhloPassIncGen + MLIRMhloPassIncGen +) +llvm_update_compile_flags(mlir-hlo-opt) +target_link_libraries(mlir-hlo-opt PRIVATE ${LIBS}) + +mlir_check_all_link_libraries(mlir-hlo-opt) diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp index 70fc21d6959..d0c0e3c51e1 100644 --- a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp +++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp @@ -13,109 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" -#include "mlir-hlo/Dialect/mhlo/IR/register.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/register_passes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/FileUtilities.h" #include "mlir/Support/MlirOptMain.h" -// NOLINTNEXTLINE -static llvm::cl::opt inputFilename(llvm::cl::Positional, - llvm::cl::desc(""), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt outputFilename( - "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt splitInputFile( - "split-input-file", - llvm::cl::desc("Split the input file into pieces and process each " - "chunk independently"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyDiagnostics( - "verify-diagnostics", - llvm::cl::desc("Check that emitted diagnostics match " - "expected-* lines on the corresponding line"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyPasses( - "verify-each", - llvm::cl::desc("Run the verifier after each transformation pass"), - llvm::cl::init(true)); - -// NOLINTNEXTLINE -static llvm::cl::opt allowUnregisteredDialects( - "allow-unregistered-dialect", - llvm::cl::desc("Allow operation with no registered dialects"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt showDialects( - "show-dialects", llvm::cl::desc("Print the list of registered dialects"), - llvm::cl::init(false)); - int main(int argc, char **argv) { - mlir::registerAllDialects(); mlir::registerAllPasses(); - - mlir::mhlo::registerAllDialects(); mlir::mhlo::registerAllMhloPasses(); mlir::lmhlo::registerAllLmhloPasses(); - llvm::InitLLVM y(argc, argv); + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + registry.insert(); + registry.insert(); + registry.insert(); - // Register any pass manager command line options. - mlir::registerPassManagerCLOptions(); - mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run"); - - // Parse pass names in main to ensure static initialization completed. - llvm::cl::ParseCommandLineOptions(argc, argv, - "MLIR modular optimizer driver\n"); - - if (showDialects) { - mlir::MLIRContext context; - llvm::outs() << "Registered Dialects:\n"; - for (mlir::Dialect *dialect : context.getRegisteredDialects()) { - llvm::outs() << dialect->getNamespace() << "\n"; - } - return 0; - } - - // Set up the input file. - std::string errorMessage; - auto file = mlir::openInputFile(inputFilename, &errorMessage); - if (!file) { - llvm::errs() << errorMessage << "\n"; - return 1; - } - - auto output = mlir::openOutputFile(outputFilename, &errorMessage); - if (!output) { - llvm::errs() << errorMessage << "\n"; - exit(1); - } - - if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, - splitInputFile, verifyDiagnostics, verifyPasses, - allowUnregisteredDialects))) { - return 1; - } - // Keep the output file if the invocation of MlirOptMain was successful. 
- output->keep(); - return 0; + return failed( + mlir::MlirOptMain(argc, argv, "MLIR HLO pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 555c11779f5..aee6cd5ad91 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -29,6 +29,7 @@ filegroup( "ir/tfl_ops.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], @@ -227,6 +228,7 @@ cc_library( "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LoopLikeInterface", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SideEffects", @@ -237,6 +239,28 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "constant_utils", + srcs = [ + "utils/constant_utils.cc", + ], + hdrs = [ + "utils/constant_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:status", + "//tensorflow/stream_executor/lib", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "lstm_utils", srcs = [ @@ -256,6 +280,28 @@ cc_library( ], ) +cc_library( + name = "nms_utils", + srcs = [ + "utils/nms_utils.cc", + ], + hdrs = [ + "utils/nms_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/core:framework", + "@flatbuffers", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "tftext_utils", srcs = [ @@ -347,7 +393,9 @@ cc_library( "transforms/passes.h", ], deps = [ + ":constant_utils", ":lstm_utils", + ":nms_utils", ":stateful_ops_utils", ":tensorflow_lite", ":tftext_utils", @@ -359,6 +407,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tensorflow:unroll_batch_matmul_pass", @@ -477,25 +526,13 @@ gentbl( tblgen = "//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen", td_file = "ir/tfl_ops.td", td_srcs = [ + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "ir/tfl_op_interfaces.td", ], ) -# Library with tensorflow Lite dialect static initialization. 
-cc_library( - name = "tensorflow_lite_dialect_registration", - srcs = [ - "ir/dialect_registration.cc", - ], - deps = [ - ":tensorflow_lite", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) - tf_native_cc_binary( name = "converter-gen", srcs = [ @@ -602,12 +639,10 @@ cc_library( ":flatbuffer_tflite_operator_lib", ":stateful_ops_utils", ":tensorflow_lite", - ":tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:protos_all_cc", @@ -646,7 +681,7 @@ cc_library( ":convert_type", ":flatbuffer_tflite_operator_lib", ":tensorflow_lite", - ":tensorflow_lite_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", @@ -714,16 +749,13 @@ cc_library( ], deps = [ ":flatbuffer_translate_lib", + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirTranslateMain", "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", @@ -736,7 +768,7 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_registeration", # TODO(b/155809683): Link only necessary dialects. - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) @@ -788,7 +820,7 @@ tf_cc_binary( "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", # TODO(b/155809683): Link only necessary dialects. - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", @@ -812,19 +844,18 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_lib", ":flatbuffer_translate_registeration", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - # TODO(b/155809683): Link only necessary dialects. 
- "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Support", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "//tensorflow/lite:framework", "//tensorflow/lite/delegates/flex:delegate", "//tensorflow/lite/kernels:builtin_ops", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:StandardOps", ], ) @@ -844,14 +875,13 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/core:core_cpu_base", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", @@ -885,7 +915,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc index edead2037a3..44eba0d5e6f 100644 --- a/tensorflow/compiler/mlir/lite/converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -513,7 +513,7 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { continue; } if (trait.getDef().getValueAsString("trait") != - "OpTrait::TFLRuntimeOpTrait") { + "::mlir::OpTrait::TFLRuntimeOpTrait") { continue; } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 89fae87cb25..34200fb88b6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -61,6 +61,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" @@ -133,63 +134,59 @@ static StatusOr GetTFLiteType(Type type, return Status(error::INVALID_ARGUMENT, "'isSigned' can only be set for 8-bits integer type"); } - switch (type.getKind()) { - case mlir::StandardTypes::F32: - return tflite::TensorType_FLOAT32; - case mlir::StandardTypes::F16: - return tflite::TensorType_FLOAT16; - case mlir::StandardTypes::F64: - return tflite::TensorType_FLOAT64; - case mlir::TF::TensorFlowTypes::STRING: - return tflite::TensorType_STRING; - case mlir::TF::TensorFlowTypes::QUINT8: - return tflite::TensorType_UINT8; - case mlir::StandardTypes::Complex: { - auto ftype = type.cast().getElementType(); - if (ftype && ftype.isF32()) { - return tflite::TensorType_COMPLEX64; - } - if (ftype && ftype.isF64()) { - return tflite::TensorType_COMPLEX128; - } - return Status(error::INVALID_ARGUMENT, "Unsupported type"); + + if (type.isF32()) { + return tflite::TensorType_FLOAT32; + } else if (type.isF16()) { + return tflite::TensorType_FLOAT16; + } else if (type.isF64()) { + return tflite::TensorType_FLOAT64; + } else if (type.isa()) { + return tflite::TensorType_STRING; + } else if (type.isa()) { + return tflite::TensorType_UINT8; + } else if (auto complex_type = type.dyn_cast()) { + auto ftype = complex_type.getElementType(); + if (ftype.isF32()) { + return tflite::TensorType_COMPLEX64; } - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return tflite::TensorType_BOOL; - case 8: - return itype.isUnsigned() ? tflite::TensorType_UINT8 - : tflite::TensorType_INT8; - case 16: - return tflite::TensorType_INT16; - case 32: - return tflite::TensorType_INT32; - case 64: - return tflite::TensorType_INT64; - } + if (ftype.isF64()) { + return tflite::TensorType_COMPLEX128; } - case mlir::quant::QuantizationTypes::UniformQuantized: { - auto qtype = type.cast(); - return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); + return Status(error::INVALID_ARGUMENT, "Unsupported type"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return tflite::TensorType_BOOL; + case 8: + return itype.isUnsigned() ? tflite::TensorType_UINT8 + : tflite::TensorType_INT8; + case 16: + return tflite::TensorType_INT16; + case 32: + return tflite::TensorType_INT32; + case 64: + return tflite::TensorType_INT64; } - case mlir::quant::QuantizationTypes::UniformQuantizedPerAxis: { - auto qtype = type.cast(); - return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); - } - case mlir::TF::TensorFlowTypes::RESOURCE: { - // Treat tf.resource values as integer values in flatbuffer. - // TODO(b/146131919): Maybe need to have a detailed design for supporting - // other resource types beyonds hash table resources and resource - // variables. - return tflite::TensorType_INT32; - } - default: - // TFLite export fills FLOAT32 for unknown data types. Returning an error - // for now for safety and this could be revisited when required. 
- return Status(error::INVALID_ARGUMENT, "Unsupported type"); + } else if (auto q_uniform_type = + type.dyn_cast()) { + return GetTFLiteType(q_uniform_type.getStorageType(), + q_uniform_type.isSigned()); + + } else if (auto q_peraxis_type = + type.dyn_cast()) { + return GetTFLiteType(q_peraxis_type.getStorageType(), + q_peraxis_type.isSigned()); + } else if (type.isa()) { + // Treat tf.resource values as integer values in flatbuffer. + // TODO(b/146131919): Maybe need to have a detailed design for supporting + // other resource types beyonds hash table resources and resource + // variables. + return tflite::TensorType_INT32; } + // TFLite export fills FLOAT32 for unknown data types. Returning an error + // for now for safety and this could be revisited when required. + return Status(error::INVALID_ARGUMENT, "Unsupported type"); } static bool IsConst(Operation* op) { @@ -358,8 +355,13 @@ class Translator { if (emit_custom_ops) { enabled_op_types_.emplace(OpType::kCustomOp); } - tf_dialect_ = module.getContext()->getRegisteredDialect("tf"); - tfl_dialect_ = module.getContext()->getRegisteredDialect("tfl"); + tf_dialect_ = + module.getContext()->getOrLoadDialect(); + tfl_dialect_ = module.getContext() + ->getOrLoadDialect(); + // Right now the TF executor dialect is still needed to build NodeDef. + module.getContext() + ->getOrLoadDialect(); } Optional TranslateInternal(); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 3c8bf26aa14..62eaffa8ed9 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -65,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -254,20 +255,35 @@ mlir::Operation* ConvertMinMaxToStatsOp(const TensorT& tensor, OpBuilder b, layer_stats, axis_stats, axis); } -StatusOr OpNameForOpCode(const tflite::OperatorCodeT opcode) { - if (opcode.builtin_code == tflite::BuiltinOperator_CUSTOM) { +// Returns true if this is a basic LSTM op. +bool IsBasicLSTMOp(tflite::BuiltinOptionsUnion op_union) { + if (const auto* op = op_union.AsLSTMOptions()) { + return op->kernel_type == tflite::LSTMKernelType_BASIC; + } else { + return false; + } +} + +// Gets the MLIR op name with the dialect name for the flatbuffer operator. 
+StatusOr GetMlirOpName(const tflite::OperatorT& op, + const tflite::OperatorCodeT& op_code) { + if (IsBasicLSTMOp(op.builtin_options)) { + return std::string("tfl.basic_lstm"); + } + + if (op_code.builtin_code == tflite::BuiltinOperator_CUSTOM) { return std::string("tfl.custom"); } - if (opcode.builtin_code == tflite::BuiltinOperator_IF) { + if (op_code.builtin_code == tflite::BuiltinOperator_IF) { return std::string("tf.If"); } - if (opcode.builtin_code == tflite::BuiltinOperator_WHILE) { + if (op_code.builtin_code == tflite::BuiltinOperator_WHILE) { return std::string("tf.While"); } - const char* op_name = tflite::EnumNameBuiltinOperator(opcode.builtin_code); - std::string lowered_name = llvm::StringRef(op_name).lower(); - return llvm::Twine("tfl.", lowered_name).str(); + llvm::StringRef op_name( + tflite::EnumNameBuiltinOperator(op_code.builtin_code)); + return llvm::Twine("tfl.", op_name.lower()).str(); } // The buffers in TFLite flatbuffers have their contents stored as a vector of @@ -464,7 +480,7 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, value = mlir::DenseStringElementsAttr::get(shaped_type, refs); } else if (elem_type.isa()) { - auto dialect = elem_type.getContext()->getRegisteredDialect("tf"); + auto dialect = elem_type.getContext()->getLoadedDialect("tf"); tensorflow::TensorProto repr = ConvertTfliteConstTensor(tensor, buffer); std::string mangled = tensorflow::mangling_util::MangleTensor(repr); @@ -510,14 +526,6 @@ llvm::SmallVector ConvertSubgraphIdxsToFunctionAttrs( return {}; } -// Returns true if this is a basic LSTM op. -bool IsBasicLSTMOp(tflite::BuiltinOptionsUnion op_union) { - if (const auto* op = op_union.AsLSTMOptions()) { - return op->kernel_type == tflite::LSTMKernelType_BASIC; - } else { - return false; - } -} // TODO(krzysd) Handle function calls StatusOr ConvertOp( @@ -525,7 +533,6 @@ StatusOr ConvertOp( const std::vector& intermediate_types, Value optional_arg_marker, const std::vector>& op_codes, - const std::vector& op_names, const std::vector& func_names, const std::vector>& tensors, Location loc, OpBuilder builder) { @@ -537,10 +544,10 @@ StatusOr ConvertOp( return emitError(loc, err.ToString()), err; } - const bool is_basic_lstm = IsBasicLSTMOp(op.builtin_options); - const tflite::OperatorCodeT op_code = *op_codes.at(op.opcode_index); - const std::string& op_name = - is_basic_lstm ? "tfl.basic_lstm" : op_names.at(op.opcode_index); + const tflite::OperatorCodeT& op_code = *op_codes.at(op.opcode_index); + + TF_ASSIGN_OR_RETURN(const std::string op_name, GetMlirOpName(op, op_code)); + OperationState op_state(loc, op_name); for (auto input_num : op.inputs) { @@ -777,7 +784,7 @@ static StatusOr PostProcessFuncOp(FuncOp func) { auto new_output_type = new_qtype.castFromExpressedType( mlir::quant::UniformQuantizedType::castToExpressedType( value.getType())); - builder.setInsertionPointAfter(cst); + builder.setInsertionPointAfter(cst.getOperation()); auto new_op = builder.create( cst.getLoc(), new_output_type, mlir::TypeAttr::get(new_output_type), cst.valueAttr()); @@ -791,8 +798,7 @@ static StatusOr PostProcessFuncOp(FuncOp func) { } // Build a FuncOp from a tflite SubGraph -// The op_names are a mapping from indexes into the TFLite operators array to -// the operator name MLIR expects (tfl.foo_op). The buffers are directly taken +// The buffers are directly taken // from the deserialized flatbuffer as we do not have the type information to // interpret them until this point. 
The base_loc parameter is the location of // the flatbuffer as a whole (usually a file). The is_entry_point flag @@ -802,7 +808,6 @@ static StatusOr PostProcessFuncOp(FuncOp func) { StatusOr ConvertSubgraph( const tflite::SubGraphT& subgraph, llvm::StringRef name, const std::vector>& op_codes, - const std::vector& op_names, const std::vector& func_names, const std::vector>& buffers, Location base_loc, Builder builder, bool is_entry_point, @@ -1002,8 +1007,7 @@ StatusOr ConvertSubgraph( TF_ASSIGN_OR_RETURN( auto* mlir_op, ConvertOp(*op, vals_map, intermediate_types, maybe_optional_arg_marker, - op_codes, op_names, func_names, subgraph.tensors, op_loc, - op_builder)); + op_codes, func_names, subgraph.tensors, op_loc, op_builder)); // Add the results to the value maps. There are two cases: 1. the result // tensor does not have min/max values, the original op result is used @@ -1069,6 +1073,10 @@ OwningModuleRef tflite::FlatBufferToMlir( const std::vector& ordered_input_arrays, const std::vector& ordered_output_arrays, bool experimental_prune_unreachable_nodes_unconditionally) { + context->loadDialect< + mlir::StandardOpsDialect, mlir::quant::QuantizationDialect, + mlir::TFL::TensorFlowLiteDialect, mlir::TF::TensorFlowDialect>(); + auto model_ptr = FlatBufferModel::VerifyAndBuildFromBuffer(buffer.data(), buffer.length()); if (nullptr == model_ptr) { @@ -1079,17 +1087,6 @@ OwningModuleRef tflite::FlatBufferToMlir( auto builder = Builder(context); - std::vector operator_names; - operator_names.reserve(model->operator_codes.size()); - - for (auto& opcode : model->operator_codes) { - auto operator_name_or_error = OpNameForOpCode(*opcode); - if (!operator_name_or_error.ok()) { - return emitError(base_loc, operator_name_or_error.status().ToString()), - nullptr; - } - operator_names.push_back(operator_name_or_error.ConsumeValueOrDie()); - } std::vector func_names; for (auto& subgraph : model->subgraphs) { @@ -1110,8 +1107,8 @@ OwningModuleRef tflite::FlatBufferToMlir( auto& subgraph = e.value(); std::string name = SubgraphName(e.index(), *subgraph); auto func_or_error = ConvertSubgraph( - *subgraph, name, model->operator_codes, operator_names, func_names, - model->buffers, base_loc, builder, + *subgraph, name, model->operator_codes, func_names, model->buffers, + base_loc, builder, // TODO(b/131175224,b/132239787) Support multiple entry points /*is_entry_point=*/e.index() == 0, /*use_external_constant=*/use_external_constant, ordered_input_arrays, diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index ceaa4e215cf..60fd1160be2 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -95,50 +95,44 @@ static tflite::MirrorPadMode ConvertTFL_MirrorPaddingAttrForOptionWriter( static tflite::TensorType ConvertDerivedTypeAttrForOptionWriter( mlir::Type type, flatbuffers::FlatBufferBuilder* builder) { - switch (type.getKind()) { - case mlir::StandardTypes::F16: - return tflite::TensorType_FLOAT16; - case mlir::StandardTypes::F32: - return tflite::TensorType_FLOAT32; - case mlir::TF::TensorFlowTypes::STRING: - return tflite::TensorType_STRING; - case mlir::StandardTypes::Complex: { - auto etype = type.cast().getElementType(); - if (etype.isF32()) { - return tflite::TensorType_COMPLEX64; - } - llvm_unreachable("invalid complex Type in conversion"); + if (type.isF16()) { + return tflite::TensorType_FLOAT16; + } else if (type.isF32()) { + return tflite::TensorType_FLOAT32; 
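In the FlatBufferToMlir hunk above, the importer now calls `context->loadDialect<...>()` up front so every dialect it will create ops for is loaded before any op is built, instead of relying on global registration. A minimal sketch of that preparation step; the dialect list here is illustrative, not the importer's exact list:

```cpp
// Sketch: pre-load the dialects an importer will emit, as FlatBufferToMlir
// does above. The exact dialect list depends on what the importer builds.
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"

void PrepareImportContext(mlir::MLIRContext &context) {
  // loadDialect is idempotent; loading an already-loaded dialect is harmless.
  context.loadDialect<mlir::StandardOpsDialect,
                      mlir::TFL::TensorFlowLiteDialect>();
}
```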
+ } else if (type.isa()) { + return tflite::TensorType_STRING; + } else if (auto complex_type = type.dyn_cast()) { + if (complex_type.getElementType().isF32()) { + return tflite::TensorType_COMPLEX64; } - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return tflite::TensorType_BOOL; - case 8: - return tflite::TensorType_INT8; - case 16: - return tflite::TensorType_INT16; - case 32: - return tflite::TensorType_INT32; - case 64: - return tflite::TensorType_INT64; - default: - llvm_unreachable("invalid integer Type in conversion"); - } + llvm_unreachable("invalid complex Type in conversion"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return tflite::TensorType_BOOL; + case 8: + return tflite::TensorType_INT8; + case 16: + return tflite::TensorType_INT16; + case 32: + return tflite::TensorType_INT32; + case 64: + return tflite::TensorType_INT64; + default: + llvm_unreachable("invalid integer Type in conversion"); } - default: - llvm_unreachable("invalid Type in conversion"); } + llvm_unreachable("invalid Type in conversion"); } // I32Attr already returns an int as required by flatbuffer builders. static int ConvertI32AttrForOptionWriter( - llvm::APInt i, flatbuffers::FlatBufferBuilder* builder) { - return i.getSExtValue(); + int i, flatbuffers::FlatBufferBuilder* builder) { + return i; } static int ConvertPositiveI32AttrForOptionWriter( - llvm::APInt i, flatbuffers::FlatBufferBuilder* builder) { + int i, flatbuffers::FlatBufferBuilder* builder) { return ConvertI32AttrForOptionWriter(i, builder); } @@ -255,7 +249,7 @@ Status mlir::CustomOptionsToAttributes( {static_cast(custom_options.size())}, builder.getIntegerType(8)); attributes->emplace_back(builder.getNamedAttr( "custom_option", - OpaqueElementsAttr::get(builder.getContext()->getRegisteredDialect("tfl"), + OpaqueElementsAttr::get(builder.getContext()->getLoadedDialect("tfl"), type, content))); return Status::OK(); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index 5b95b30a96c..94f7e2261f7 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -17,6 +17,7 @@ limitations under the License. #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -33,6 +34,8 @@ limitations under the License. 
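In the flatbuffer_translate.cc hunk that continues below, MLIRToFlatBufferTranslate gains a dialect-registration callback, so the translation declares the dialects it needs rather than depending on global registration. A hedged sketch of that registration shape, assuming the three-argument `TranslateFromMLIRRegistration` overload used below; the translation name and function here are placeholders:

```cpp
// Sketch: a translate registration that declares its own dialects, mirroring
// the MLIRToFlatBufferTranslate change below. "MyTranslateFunction" and
// "my-translation" are placeholders, not part of the patch.
#include "llvm/Support/raw_ostream.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Module.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Translation.h"
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"

static mlir::LogicalResult MyTranslateFunction(mlir::ModuleOp module,
                                               llvm::raw_ostream &output) {
  // A real translation serializes `module` here; this stub just succeeds.
  return mlir::success();
}

static mlir::TranslateFromMLIRRegistration registration(
    "my-translation", MyTranslateFunction,
    [](mlir::DialectRegistry &registry) {
      // Only the dialects this translation actually consumes are registered.
      registry.insert<mlir::StandardOpsDialect,
                      mlir::TFL::TensorFlowLiteDialect>();
    });
```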
#include "mlir/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" using llvm::cl::opt; @@ -175,5 +178,11 @@ static TranslateToMLIRRegistration FlatBufferFileToMlirTransReg( }); static TranslateFromMLIRRegistration MLIRToFlatBufferTranslate( - "mlir-to-tflite-flatbuffer", MlirToFlatBufferFileTranslateFunction); + "mlir-to-tflite-flatbuffer", MlirToFlatBufferFileTranslateFunction, + [](DialectRegistry& registry) { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + }); } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index ae1e3ebe5e6..2894af9b97e 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -30,6 +30,7 @@ limitations under the License. #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -40,10 +41,10 @@ limitations under the License. #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.cc.inc" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { -#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.cc.inc" namespace TFL { // Returns true when the given operand arguments have the same shape or @@ -253,9 +254,8 @@ struct TensorFlowLiteInlinerInterface : public DialectInlinerInterface { } }; -struct TensorFlowLiteOpFolderDialectInterface - : public OpFolderDialectInterface { - using OpFolderDialectInterface::OpFolderDialectInterface; +struct TensorFlowLiteDialectFoldInterface : public DialectFoldInterface { + using DialectFoldInterface::DialectFoldInterface; // Registered hook to check if the given region, which is attached to an // operation that is *not* isolated from above (i.e. no internal regions @@ -269,13 +269,13 @@ struct TensorFlowLiteOpFolderDialectInterface }; TensorFlowLiteDialect::TensorFlowLiteDialect(mlir::MLIRContext *context) - : Dialect(/*name=*/"tfl", context) { + : Dialect(/*name=*/"tfl", context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" >(); addInterfaces(); + TensorFlowLiteDialectFoldInterface>(); } //===----------------------------------------------------------------------===// @@ -569,7 +569,7 @@ namespace { int64_t GetConcatenationOpAxis(ConcatenationOp op) { auto output_type = op.output().getType().cast(); - int64_t axis = op.axis().getSExtValue(); + int32_t axis = op.axis(); if (axis < 0) axis += output_type.getRank(); return axis; } @@ -1027,10 +1027,13 @@ static LogicalResult Verify(PackOp op) { // Check axis bounds. 
if (input_type.hasRank()) { - int64_t axis_value = op.axis().getSExtValue(); - if (abs(axis_value) > input_type.getRank()) - return op.emitOpError("op attribute 'axis' is out of bounds, got ") - << axis_value; + int32_t axis_value = op.axis(); + if (axis_value < 0) axis_value += input_type.getRank() + 1; + if (axis_value < 0 || axis_value >= input_type.getRank() + 1) + return op.emitOpError() + << "op attribute 'axis' should be in range [-rank - 1, rank + 1), " + << "got rank = " << input_type.getRank() + << ", and axis = " << op.axis(); } // Make sure all inputs have the same shape and element type. @@ -1443,12 +1446,59 @@ void FakeQuantOp::getCanonicalizationPatterns(OwningRewritePatternList &results, // TODO(b/133486129): Implement shape inference for unpack -static LogicalResult Verify(UnpackOp op) { - // TODO(antiagainst): Implement other checks as in - // tensorflow/lite/kernels/unpack.cc +LogicalResult UnpackOp::inferReturnTypes( + MLIRContext *context, Optional loc, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + UnpackOpAdaptor op(operands, attributes); + // TODO(jpienaar): Refactor verify + if (failed(op.verify(loc.hasValue() ? *loc : UnknownLoc::get(context)))) + return failure(); - if (op.getOperation()->getNumResults() != op.num()) - return op.emitOpError("output count should match 'num' attribute"); + if (operands.size() != 1) { + return emitOptionalError(loc, "input count should be equal to 1"); + } + + const int64_t num_value = op.num().getInt(); + auto input_type = operands[0].getType().dyn_cast(); + if (!input_type || !input_type.hasRank()) { + // If input is unranked, then so is output. + inferredReturnTypes.assign( + num_value, UnrankedTensorType::get(input_type.getElementType())); + return success(); + } + + if (input_type.hasStaticShape() && input_type.getNumElements() <= 0) { + return emitOptionalError( + loc, "number of elements in input shoule be larger than 0"); + } + + const int64_t rank = input_type.getRank(); + if (rank <= 0) { + return emitOptionalError(loc, "input should be of rank larger than 0"); + } + + int64_t axis_value = op.axis().getInt(); + if (axis_value < 0) { + axis_value += rank; + } + if (axis_value < 0 || axis_value >= rank) { + return emitOptionalError( + loc, "attribute 'axis' should be in range [-rank, rank), got axis = ", + op.axis().getInt(), ", and rank = ", rank); + } + + if (!ShapedType::isDynamic(input_type.getDimSize(axis_value)) && + input_type.getDimSize(axis_value) != num_value) { + return emitOptionalError(loc, "output count should match 'num' attribute"); + } + + auto output_shape = llvm::to_vector<4>(input_type.getShape()); + output_shape.erase(output_shape.begin() + axis_value); + + auto output_type = + RankedTensorType::get(output_shape, input_type.getElementType()); + inferredReturnTypes.assign(num_value, output_type); return success(); } @@ -1495,7 +1545,7 @@ static LogicalResult VerifySplitOpOutputTypes( } static LogicalResult Verify(SplitOp op) { - int64_t num_splits = op.num_splits().getSExtValue(); + int64_t num_splits = op.num_splits(); if (op.getNumResults() != num_splits) return op.emitOpError("output count should match 'num_splits' attribute"); @@ -1531,7 +1581,7 @@ static LogicalResult Verify(SplitOp op) { } static LogicalResult Verify(SplitVOp op) { - int64_t num_splits = op.num_splits().getSExtValue(); + int64_t num_splits = op.num_splits(); if (op.getNumResults() != num_splits) return op.emitOpError("output count should match 'num_splits' 
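The PackOp verifier and the new UnpackOp::inferReturnTypes above both normalize a possibly negative `axis` before range-checking it (pack allows [-rank - 1, rank + 1) because it inserts a dimension; unpack allows [-rank, rank) and then drops that dimension to form each result type). The arithmetic for the unpack case, isolated into a standalone sketch with plain C++ containers instead of MLIR types:

```cpp
// Standalone sketch of the axis handling in the UnpackOp hunk above:
// normalize a negative axis, bounds-check it, and compute the shape of each
// unpacked result by erasing that dimension.
#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<int64_t>> UnpackResultShape(
    std::vector<int64_t> input_shape, int64_t axis) {
  const int64_t rank = static_cast<int64_t>(input_shape.size());
  if (rank <= 0) return std::nullopt;  // input must have rank larger than 0
  if (axis < 0) axis += rank;          // e.g. axis = -1, rank = 2 -> axis = 1
  if (axis < 0 || axis >= rank) return std::nullopt;  // must be in [-rank, rank)
  // Each of the `num` results has the input shape with `axis` removed.
  input_shape.erase(input_shape.begin() + axis);
  return input_shape;
}

// Example: UnpackResultShape({2, 3}, -1) yields {2}, so each of the three
// results of "tfl.unpack"(tensor<2x3xi32>) {axis = -1, num = 3} is
// tensor<2xi32>, matching the ops.mlir tests later in this patch.
```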
attribute"); @@ -2327,8 +2377,16 @@ LogicalResult WhileOp::moveOutOfLoop(llvm::ArrayRef ops) { //===----------------------------------------------------------------------===// #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.cc.inc" + +} // namespace TFL +} // namespace mlir + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" + +namespace mlir { +namespace TFL { + #include "tensorflow/compiler/mlir/lite/runtime_verifiers.inc" Operation *TensorFlowLiteDialect::materializeConstant(OpBuilder &builder, diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index caed0bb3ad9..589f18d789d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -26,14 +26,15 @@ limitations under the License. #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.h.inc" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/lite/schema/schema_generated.h" namespace mlir { -#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.h.inc" namespace TFL { class TensorFlowLiteDialect : public Dialect { @@ -49,10 +50,11 @@ class TensorFlowLiteDialect : public Dialect { }; #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" } // end namespace TFL } // end namespace mlir +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_OPS_H_ diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 6dc9fda656f..1b91c0dbe61 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -19,6 +19,7 @@ limitations under the License. #define TFL_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" @@ -38,7 +39,7 @@ def TFL_Dialect : Dialect { represented using zero-dimensional tensors); }]; - let cppNamespace = "TFL"; + let cppNamespace = "::mlir::TFL"; } //===----------------------------------------------------------------------===// @@ -107,7 +108,11 @@ def OpaqueBytesAttr : ElementsAttrBase< ".getElementType().isInteger(8)">, ]>, "opaque bytes attribute" - >; + > { + let storageType = [{ OpaqueElementsAttr }]; + let returnType = [{ OpaqueElementsAttr }]; + let convertFromStorage = "$_self"; +} //===----------------------------------------------------------------------===// // Derived shape attribute class. 
@@ -2442,8 +2447,7 @@ def TFL_ReluOp: TFL_Op<"relu", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu operator"; let description = [{ @@ -2471,8 +2475,7 @@ def TFL_Relu6Op: TFL_Op<"relu6", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu6 operator"; let description = [{ @@ -2500,8 +2503,7 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu1 operator"; let description = [{ @@ -3024,7 +3026,8 @@ def TFL_TransposeOp : TFL_Op<"transpose", [ def TFL_UnpackOp : TFL_Op<"unpack", [ NoSideEffect, SameOperandsAndResultElementType, - SameOperandsAndResultsScale]> { + SameOperandsAndResultsScale, + DeclareOpInterfaceMethods]> { let summary = "Unpacks a tensor along a dimension into multiple tensors"; let description = [{ @@ -3047,7 +3050,7 @@ def TFL_UnpackOp : TFL_Op<"unpack", [ let arguments = (ins TFL_TensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$input, - I32Attr:$num, + Confined:$num, I32Attr:$axis ); @@ -3055,8 +3058,6 @@ def TFL_UnpackOp : TFL_Op<"unpack", [ TFL_VariadicTensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$outputs ); - let verifier = [{ return Verify(*this); }]; - let hasOptions = 1; } diff --git a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc index 0d42fbb9646..35a58a01a29 100644 --- a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc +++ b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc @@ -30,12 +30,16 @@ limitations under the License. #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/lite/delegates/flex/delegate.h" @@ -98,6 +102,10 @@ int main(int argc, char** argv) { // Load the MLIR module. 
mlir::MLIRContext context; + context.getDialectRegistry() + .insert(); + llvm::SourceMgr source_mgr; source_mgr.AddNewSourceBuffer(std::move(*file_or_err), llvm::SMLoc()); mlir::OwningModuleRef module(mlir::parseSourceFile(source_mgr, &context)); diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 6299a70b1df..7e7d4678a87 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -62,6 +62,10 @@ class ImportQuantStatsPass void runOnFunction() override; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + // Parses the serialized quant stats protobuf and initialize the internal // data structure. This method must be called after the pass is created. bool ParseQuantStats(const std::string &stats_str); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 31c0e4cb8a9..38c7ad86e05 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -28,6 +28,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", @@ -74,6 +75,6 @@ tf_cc_binary( "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index a2e3c065113..238710bcf13 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -25,6 +25,7 @@ limitations under the License. 
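ImportQuantStatsPass above now declares the dialects it may create ops for via `getDependentDialects`, the hook passes use to have those dialects loaded before they run once global registration is gone. A hedged sketch of the same hook on a minimal pass; the pass class here is a placeholder, not part of the patch:

```cpp
// Sketch: a pass declaring its dependent dialects, as ImportQuantStatsPass
// does above. "ExamplePass" is a placeholder.
#include "mlir/Dialect/Quant/QuantOps.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"

namespace {
struct ExamplePass
    : public mlir::PassWrapper<ExamplePass, mlir::FunctionPass> {
  void getDependentDialects(mlir::DialectRegistry &registry) const override {
    // Any dialect whose ops this pass may build must be listed here so the
    // context loads it before the pass runs.
    registry.insert<mlir::quant::QuantizationDialect>();
  }
  void runOnFunction() override {}
};
}  // namespace
```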
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" @@ -52,6 +53,7 @@ TfLiteStatus QuantizeModel( } MLIRContext context; + context.getDialectRegistry().insert(); StatusScopedDiagnosticHandler statusHandler(&context, /*propagate=*/true); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 9e0ad990657..16b51496b5f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -99,12 +99,14 @@ class QuantizationDriver { public: explicit QuantizationDriver(FuncOp fn, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter) + OpQuantSpecGetter op_quant_spec_getter, + bool enforce_fixed_output_range) : fn_(fn), builder_(fn.getBody()), is_signed_(is_signed), disable_per_channel_(disable_per_channel), - op_quant_spec_getter_(op_quant_spec_getter) {} + op_quant_spec_getter_(op_quant_spec_getter), + enforce_fixed_output_range_(enforce_fixed_output_range) {} // The entry point of the quantization parameters propagation. void Run(); @@ -354,6 +356,8 @@ class QuantizationDriver { llvm::SmallVector args_; OpQuantSpecGetter op_quant_spec_getter_; + + bool enforce_fixed_output_range_; }; } // namespace @@ -794,7 +798,8 @@ bool QuantizationDriver::PropagateParams() { } // TODO(fengliuai): make the bit width configurable. - if (auto restricted = llvm::dyn_cast(op)) { + auto restricted = llvm::dyn_cast(op); + if (restricted && enforce_fixed_output_range_) { // TODO(fengliuai): different result can have different fixed range. 
auto params = restricted.GetFixedOutputRange(is_signed_, /*bit_width=*/8); for (auto i = 0; i < op->getNumResults(); ++i) { @@ -864,10 +869,12 @@ void QuantizationDriver::Run() { } } -void ApplyQuantizationParamsPropagation( - mlir::FuncOp func, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter) { - QuantizationDriver(func, is_signed, disable_per_channel, op_quant_spec_getter) +void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, + bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool post_training_quantization) { + QuantizationDriver(func, is_signed, disable_per_channel, op_quant_spec_getter, + post_training_quantization) .Run(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 07e5ba4e879..eb9843f6e4a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -106,9 +106,9 @@ struct ConvertStatsToQDQs : public OpRewritePattern { mins.push_back(FloatAttr::getValueAsDouble(*it++)); maxs.push_back(FloatAttr::getValueAsDouble(*it)); } - quant_type = quant::fakeQuantAttrsToType( - op.getLoc(), num_bits, op.axis()->getSExtValue(), mins, maxs, - narrow_range, expressed, is_signed); + quant_type = + quant::fakeQuantAttrsToType(op.getLoc(), num_bits, *op.axis(), mins, + maxs, narrow_range, expressed, is_signed); } else if (auto stats = op.layerStats().dyn_cast()) { double rmin = FloatAttr::getValueAsDouble(stats.getValue({0})); double rmax = FloatAttr::getValueAsDouble(stats.getValue({1})); @@ -119,7 +119,7 @@ struct ConvertStatsToQDQs : public OpRewritePattern { return failure(); } - rewriter.setInsertionPointAfter(op); + rewriter.setInsertionPointAfter(op.getOperation()); Type result_type = quant_type.castFromExpressedType(op.getType()); auto q = rewriter.create(op.getLoc(), result_type, op.arg()); auto dq = rewriter.create(op.getLoc(), op.getType(), q); @@ -490,9 +490,13 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( // and the propagation results are materialized by inserting pairs of quantize // and dequantize ops to this function. Set `disable_per_channel` to true to not // use per channel quantization even the op supports it. +// Setting `enforce_fixed_output_range` to true, to infer quantization +// parameters from the fixed output range ops. This is only used for +// post-training quantization. void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter); + OpQuantSpecGetter op_quant_spec_getter, + bool enforce_fixed_output_range); // The function might contain more stats ops than required, and it will // introduce requantize if the calibration stats have conflicts. This method diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index 0826b3265f6..b043834188c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -106,9 +106,8 @@ struct InsertQuantOpsAfterTFFakeQuantOp } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. 
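The new `enforce_fixed_output_range` flag documented above gates whether propagation overwrites a result's quantization parameters with the op's fixed output range, which matters for post-training quantization of ops whose output range is known a priori (sigmoid-like ops, for example). For reference, a generic affine-quantization sketch of how a fixed real range maps to a scale and zero point; this is the textbook formula, not necessarily the exact parameters TFLite pins for any particular op:

```cpp
// Generic affine quantization of a known, fixed real range [rmin, rmax] onto
// a signed 8-bit grid. Illustrative only; kernels may pin slightly different
// parameters (e.g. power-of-two scales) for specific ops.
#include <cmath>
#include <cstdint>
#include <cstdio>

struct QuantParams {
  double scale;
  int32_t zero_point;
};

QuantParams FixedRangeParams(double rmin, double rmax) {
  constexpr int32_t qmin = -128, qmax = 127;  // int8 storage
  QuantParams p;
  p.scale = (rmax - rmin) / (qmax - qmin);
  p.zero_point = static_cast<int32_t>(std::lround(qmin - rmin / p.scale));
  return p;
}

int main() {
  // A [0, 1] output range gives scale ~= 1/255 and zero point -128.
  QuantParams p = FixedRangeParams(0.0, 1.0);
  std::printf("scale=%f zero_point=%d\n", p.scale, p.zero_point);
  return 0;
}
```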
- rewriter.setInsertionPointAfter(tf_op); - IntegerAttr num_bits = - rewriter.getI64IntegerAttr(tf_op.num_bits().getSExtValue()); + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.num_bits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.narrow_range()); Type res_type = tf_op.getType(); TypeAttr qtype = quant::GetQuantizedTypeAttr( diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index 208fb4c8a56..fc56ad05535 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -55,7 +55,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { for (const auto t : op.getTraits()) { if (auto opTrait = llvm::dyn_cast(&t)) { auto trait = opTrait->getTrait(); - if (!trait.consume_front("OpTrait::quant::")) continue; + if (!trait.consume_front("::mlir::OpTrait::quant::")) continue; OUT(2) << "if (auto tfl = llvm::dyn_cast<" << op.getQualCppClassName() << ">(op)) {\n"; @@ -65,7 +65,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { OUT(4) << "for (int i = 0, e = op->getNumResults(); i != e; ++i)\n"; OUT(6) << "spec->restricted_output_params[std::make_pair(" << matches[1] << ", " << matches[2] - << ")].push_back(tfl.OpTrait::quant::" << trait << "<" + << ")].push_back(tfl.::mlir::OpTrait::quant::" << trait << "<" << op.getQualCppClassName() << ">::GetResultQuantizedType(i));\n"; matches.clear(); diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt new file mode 100644 index 00000000000..5f498a404a9 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt @@ -0,0 +1,232 @@ +# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,8,8,2 -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_0 -print-function-result-mapping %s -o - 2>&1 | FileCheck %s + +node { + name: "input" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 1 + } + dim { + size: 8 + } + dim { + size: 8 + } + dim { + size: 2 + } + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/w" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 3 + } + dim { + size: 3 + } + dim { + size: 2 + } + dim { + size: 2 + } + } + tensor_content: ";;\177<5\241i\275\312f\211>#\346j>\033W\325\275\253>\210=Vr\r\276\304\222\313\276\374\346\214>\016e\211>)\253\000>\3241\337\275\235g-\276*(\216\276\326#\367\274\023\213\300\276\227\031\206>PUF=\253\330\263<\337IL\276\334\320\215>\377\306v\276\372C\302\273baM>H\314\270<2\221\352=J\026{\276\221\243\245\276?\314\240=UW2\2755\207\253\274\256\207\333\273\335\372\227>\246\232;\276%\r\374" + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/w/read" + op: "Identity" + input: "conv_net_2d/conv_2d_0/w" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@conv_net_2d/conv_2d_0/w" + } + } + } +} +node { + name: "conv_net_2d_1/conv_2d_0/convolution" + op: "Conv2D" + input: "input" + input: "conv_net_2d/conv_2d_0/w/read" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: 
"data_format" + value { + s: "NCHW" + } + } + attr { + key: "dilations" + value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + attr { + key: "explicit_paddings" + value { + list { + } + } + } + attr { + key: "padding" + value { + s: "SAME" + } + } + attr { + key: "strides" + value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + attr { + key: "use_cudnn_on_gpu" + value { + b: true + } + } +} +node { + name: "conv_net_2d/conv_2d_0/b" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\315\314\314=\315\314\314=" + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/b/read" + op: "Identity" + input: "conv_net_2d/conv_2d_0/b" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@conv_net_2d/conv_2d_0/b" + } + } + } +} +node { + name: "conv_net_2d_1/conv_2d_0/BiasAdd" + op: "BiasAdd" + input: "conv_net_2d_1/conv_2d_0/convolution" + input: "conv_net_2d/conv_2d_0/b/read" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "conv_net_2d_1/Relu" + op: "Relu" + input: "conv_net_2d_1/conv_2d_0/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "output_0" + op: "Identity" + input: "conv_net_2d_1/Relu" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +library { +} + +# CHECK: 'main' inputs: +# CHECK-NEXT: name: 'input' +# CHECK-NEXT: 'main' outputs: +# CHECK-NEXT: name: 'output_0' diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt index f482e3db6b9..a7f6040f211 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=4:4 -tf-output-arrays=StatefulIf,StatelessIf %s -o - --output-mlir | FileCheck %s +# RUN: tf_tfl_translate -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=: -tf-output-arrays=StatefulIf,StatelessIf %s -o - --output-mlir | FileCheck %s node { name: "tf.Less" op: "Less" diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir index f6f32e7a069..138614d81e6 100644 --- a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -3435,4 +3435,19 @@ func @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_AssertGuard_false_ } // CHECK: func @ngrams_ragged_rank_2(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor<3xi64> {tf._user_specified_name = "args_0"}, %arg2: tensor {tf._user_specified_name = "args_1"}) -> (tensor, tensor<3xi64>, tensor) attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = "", width = 2 : i64}>, tf._input_shapes = [#tf.shape, #tf.shape<3>, #tf.shape], tf.signature.is_stateful} { // CHECK: %0:3 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "tftext:Ngrams", custom_option = opaque<"tfl", "0x776964746800737472696E675F736570617261746F720000006178697300726564756374696F6E5F74797065000B535452494E475F4A4F494E0004221E373E040104FF152C0204141404082401"> : tensor<77xi8>} : (tensor, tensor<3xi64>, tensor) -> (tensor, 
tensor<3xi64>, tensor) -// CHECK: return %0#0, %0#1, %0#2 : tensor, tensor<3xi64>, tensor \ No newline at end of file +// CHECK: return %0#0, %0#1, %0#2 : tensor, tensor<3xi64>, tensor + + +func @sgnn_projection(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor {tf._user_specified_name = "row_splits"}) -> tensor attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:custom:SgnnProjection", {buckets = 2147483647 : i64, hash_seed = [1902835825, -1475704015, 473120514, 1254202069, 1558833093, 1756181982, 1906603252, -1034142694, 542842690, 535515822]}>, tf._input_shapes = [#tf.shape, #tf.shape], tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<[[1902835825], [-1475704015], [473120514], [1254202069], [1558833093], [1756181982], [1906603252], [-1034142694], [542842690], [535515822]]> : tensor<10x1xi64>} : () -> tensor<10x1xi64> + %1 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 2147483647 : i64} : (tensor) -> tensor + %2 = "tf.Sgnn"(%1, %0) {device = ""} : (tensor, tensor<10x1xi64>) -> tensor<10x?xf64> + %3 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> + %4 = "tf.Reshape"(%2, %3) : (tensor<10x?xf64>, tensor<1xi64>) -> tensor + return %4 : tensor +} + + +// CHECK: func @sgnn_projection(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor {tf._user_specified_name = "row_splits"}) -> tensor attributes {sym_visibility = "private", tf._implements = #tf.func<@"tftext:custom:SgnnProjection", {buckets = 2147483647 : i64, hash_seed = [1902835825, -1475704015, 473120514, 1254202069, 1558833093, 1756181982, 1906603252, -1034142694, 542842690, 535515822]}>, tf._input_shapes = [#tf.shape, #tf.shape], tf.signature.is_stateful} { +// CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "tftext:custom:SgnnProjection", custom_option = opaque<"tfl", "0x686173685F736565640000000A00000071F86A71318B0AA8023F331CD59AC14AC5E7E95CDE35AD68F474A4711A3C5CC2421F5B20AE52EB1F6275636B6574730002094200030000000100000002000000FFFFFF7F44000000062E0A2601"> : tensor<93xi8>} : (tensor, tensor) -> tensor +// CHECK: return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir index 90266b4e78e..3c390df74b4 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir @@ -1,12 +1,11 @@ -// RUN: tf-opt %s -tfl-legalize-tf='run-tfl-runtime-verification=false' | FileCheck %s +// RUN: tf-opt %s -tfl-prepare-tf -tfl-legalize-tf='run-tfl-runtime-verification=false' | FileCheck %s func @broadcast_to_bf16(%arg0: tensor<3xbf16>, %arg1: tensor<2xi64>) -> tensor<3x3xbf16> { %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xbf16>, tensor<2xi64>) -> tensor<3x3xbf16> return %0: tensor<3x3xbf16> // CHECK-LABEL: broadcast_to_bf16 -// CHECK: [[CST:%.*]] = constant dense<1.000000e+00> : tensor -// CHECK: [[FILL:%.*]] = "tfl.fill"(%arg1, [[CST]]) : (tensor<2xi64>, tensor) -> tensor<3x3xbf16> -// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xbf16>, tensor<3x3xbf16>) -> tensor<3x3xbf16> +// CHECK: [[CST:%.*]] = constant dense<1.000000e+00> : tensor<3x3xbf16> +// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[CST]]) {fused_activation_function = "NONE"} : (tensor<3xbf16>, tensor<3x3xbf16>) -> tensor<3x3xbf16> // CHECK: return [[MUL]] : tensor<3x3xbf16> } diff --git 
a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 7cb9c4dd22c..3a2a0a8b9d2 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tfl-legalize-tf | FileCheck %s +// RUN: tf-opt %s -tfl-legalize-tf --cse | FileCheck %s func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> @@ -196,7 +196,6 @@ func @shape(%arg0: tensor) -> tensor<2xi32> { // CHECK-LABEL: shape // CHECK: "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> } func @fill(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { @@ -719,9 +718,8 @@ func @matrix_diag_v2_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> -// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -753,9 +751,8 @@ func @matrix_diag_v3_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> -// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -1029,14 +1026,48 @@ func @splitv(%arg0: tensor<1x4x3x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor, tensor<2xi32>, tensor) -> (tensor<1x4x2x3xf32>, tensor<1x4x1x3xf32>) } -func @matmul_transposed(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { +func @matmul(%arg0: tensor<40x37xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = false, transpose_b = false} : +(tensor<40x37xf32>, tensor<37x40xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul +// CHECK: %[[CST:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_0:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%arg0, %[[ARG]], %[[CST_0]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = 
"DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + +func @matmul_transposed_a(%arg0: tensor<37x40xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = true, transpose_b = false} : +(tensor<37x40xf32>, tensor<37x40xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul_transposed_a +// CHECK: %[[CST_0:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[ARG_1:.*]] = "tfl.transpose"(%arg1, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_2:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%[[ARG_0]], %[[ARG_1]], %[[CST_2]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + +func @matmul_transposed_b(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = false, transpose_b = true} : (tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> return %0 : tensor<40x40xf32> -// CHECK-LABEL: matmul_transposed +// CHECK-LABEL: matmul_transposed_b // CHECK: "tfl.fully_connected"(%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> } +func @matmul_transposed_ab(%arg0: tensor<37x40xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = true, transpose_b = true} : +(tensor<37x40xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul_transposed_ab +// CHECK: %[[CST_0:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_1:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%[[ARG_0]], %arg1, %[[CST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + func @concatv2With3Tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>, tensor) -> tensor<2x3xi32> @@ -1324,10 +1355,7 @@ func @conv2d_backprop_input(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf32>, % // CHECK: %[[ARG0:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> // CHECK: %[[CST_0:.*]] = constant unit // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> - // CHECK: %[[CST_1:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> - // CHECK: %[[ARG2:.*]] = "tfl.transpose"(%arg1, %[[CST_1]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> - // CHECK: %[[CST_2:.*]] = constant unit - // CHECK: 
%[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG2]], %arg2, %[[CST_2]]) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> + // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> // CHECK: %[[RESULT:.*]] = tfl.add %[[ARG1]], %[[ARG3]] {fused_activation_function = "NONE"} : tensor<15x28x28x1xf32> // CHECK: return %[[RESULT]] : tensor<15x28x28x1xf32> } @@ -1482,28 +1510,6 @@ func @UnidirectionalRnn(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x28xf32>) { // CHECK: return [[VAL_4]] : tensor<28x1x28xf32> // CHECK: } -func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { - %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> - return %0: tensor<3x3xf32> - -// CHECK-LABEL: broadcast_to_f32 -// CHECK: [[CST:%.*]] = constant dense<1.000000e+00> : tensor -// CHECK: [[FILL:%.*]] = "tfl.fill"(%arg1, [[CST]]) : (tensor<2xi32>, tensor) -> tensor<3x3xf32> -// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> -// CHECK: return [[MUL]] : tensor<3x3xf32> -} - -func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3x3xi32> { - %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> - return %0: tensor<3x3xi32> - -// CHECK-LABEL: broadcast_to_i32 -// CHECK: [[CST:%.*]] = constant dense<1> : tensor -// CHECK: [[FILL:%.*]] = "tfl.fill"(%arg1, [[CST]]) : (tensor<2xi32>, tensor) -> tensor<3x3xi32> -// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> -// CHECK: return [[MUL]] : tensor<3x3xi32> -} - func @matmul_batch(%arg0: tensor<10x15xf32>, %arg1: tensor<15x17xf32>) -> tensor<10x17xf32> { %0 = "tf.BatchMatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} : (tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32> @@ -1555,3 +1561,27 @@ func @add_with_int32_5d_inputs(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x // CHECK-LABEL: add_with_int32_5d_inputs // CHECK: "tf.Add"(%arg0, %arg1) } + +func @tranpose_int32_perm(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { + %cst = "tf.Const"() { value = dense<[1, 0]> : tensor<2xi32> } : () -> tensor<2xi32> + %0 = "tf.Transpose"(%arg0, %cst): (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_int32_perm + // CHECK: "tfl.transpose" +} + +func @tranpose_int64_perm(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { + %cst = "tf.Const"() { value = dense<[1, 0]> : tensor<2xi64> } : () -> tensor<2xi64> + %0 = "tf.Transpose"(%arg0, %cst): (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_int64_perm + // CHECK: "tfl.transpose" +} + +func @tranpose_arg(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) -> tensor<3x2xf32> { + %0 = "tf.Transpose"(%arg0, %arg1): (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_arg + // CHECK: "tfl.transpose" +} + diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir 
b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir deleted file mode 100644 index 7e9f66baa90..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: not flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - 2>&1 | FileCheck %s - -func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { -^bb0(%arg0: tensor<3x2xi32>): - // CHECK: error: 'unknown_op' op dialect is not registered - %0 = "unknown_op"(%arg0) : (tensor<3x2xi32>) -> tensor<3x2xi32> - return %0 : tensor<3x2xi32> -} diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 7ef6997f938..b62f5655183 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1139,9 +1139,15 @@ func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x // ----- -func @packNegInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { +func @packNegInputAxis2(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x2x4xi32> { // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} - %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> + %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x2x4xi32> + return %0 : tensor<1x2x4xi32> +} + +func @packNegInputAxis3(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> return %0 : tensor<2x1x4xi32> } @@ -1172,7 +1178,7 @@ func @pack(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { - // expected-error @+1 {{op attribute 'axis' is out of bounds, got 3}} + // expected-error @+1 {{op attribute 'axis' should be in range [-rank - 1, rank + 1), got rank = 1, and axis = 3}} %0 = "tfl.pack"(%arg0, %arg1) {axis = 3 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -1183,7 +1189,22 @@ func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) return %0#0 : tensor<2xi32> +} +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // CHECK: "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} + %0:3 = "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<3xi32> { + // CHECK: "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} + %0:2 = "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} : (tensor<2x3xi32>) -> (tensor<3xi32>, tensor<3xi32>) + return %0#0 : tensor<3xi32> } // ----- @@ -1204,6 +1225,45 @@ func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = 2, and rank = 2}} + %0:3 = "tfl.unpack"(%arg0) {axis = 2 : i32, num = 3 : 
i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = -3, and rank = 2}} + %0:3 = "tfl.unpack"(%arg0) {axis = -3 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor) -> tensor<2xi32> { + // expected-error @+1 {{input should be of rank larger than 0}} + %0:3 = "tfl.unpack"(%arg0) {axis = 0 : i32, num = 3 : i32} : (tensor) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{op inferred type incompatible with return type of operation}} + %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2x1xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { + %0:2 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 2 : i32} : (tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) + return %0#0, %0#1 : tensor<*xi32>, tensor<*xi32> +} + +// ----- + // CHECK-LABEL: testMean func @testMean(%arg0: tensor<2x2xf32>, %arg1 : tensor<1xi32>) -> tensor<1x2xf32> { // CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = false} @@ -1640,6 +1700,15 @@ func @testRelu6WithQuantizedTypes(%arg0 : tensor<10x!quant.uniform> // ----- +func @testReluWithDifferentScales(%arg0 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.relu"(%arg0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + %1 = "tfl.relu_n1_to_1"(%0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + %2 = "tfl.relu6"(%1) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %2 : tensor<10x!quant.uniform> +} + +// ----- + func @testEmbeddingLookup(%arg0 : tensor, %arg1 : tensor) -> tensor { %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index b8be96a9159..8d64bc6ed0a 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -50,6 +50,96 @@ func @fuseSubIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) } +// CHECK-LABEL: fuseAddIntoTransposeConv +func @fuseAddIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> 
tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseSubIntoTransposeConv +func @fuseSubIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.sub"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[-5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseAddIntoTransposeConvNoBias +func @fuseAddIntoTransposeConvNoBias(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant unit + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : 
tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<1.500000e+00> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseMulIntoTransposeConv +func @fuseMulIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.mul"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.500000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseMulIntoTransposeConvNoBias +func @fuseMulIntoTransposeConvNoBias(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant unit + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + %1 = "tfl.mul"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.500000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant unit + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + // CHECK-LABEL: fuseAddIntoFollowingConv2d func @fuseAddIntoFollowingConv2d(%arg0: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf32> { %cst = 
constant dense<1.5> : tensor @@ -1066,3 +1156,138 @@ func @DontConvertSqueezeToReshape(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: return %[[RESULT]] } +func @ConvertPow1ToIdentity(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.000000e+00> : tensor + %0 = "tfl.pow"(%arg0, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + +// CHECK-LABEL: ConvertPow1ToIdentity +// CHECK: return %arg0 +} + +func @ConvertPow2ToSquare(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<2.000000e+00> : tensor + %0 = "tfl.pow"(%arg0, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + +// CHECK-LABEL: ConvertPow2ToSquare +// CHECK: %[[RESULT:.*]] = tfl.mul %arg0, %arg0 {fused_activation_function = "NONE"} : tensor<2x2xf32> +// CHECK: return %[[RESULT]] +} + +func @ConvertIdentityGatherNdOp(%arg0: tensor<4x3xf32>) -> tensor<4x3xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %0 = "tfl.gather_nd"(%arg0, %cst) : (tensor<4x3xf32>, tensor<4x1xi32>) -> tensor<4x3xf32> + return %0 : tensor<4x3xf32> + +// CHECK-LABEL: ConvertIdentityGatherNdOp +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3xf32>) -> tensor<4x3xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3xf32> +} + +func @ConvertIdentityGatherNdOp3D(%arg0: tensor<4x3x4xf32>) -> tensor<4x3x4xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %0 = "tfl.gather_nd"(%arg0, %cst) : (tensor<4x3x4xf32>, tensor<4x1xi32>) -> tensor<4x3x4xf32> + return %0 : tensor<4x3x4xf32> + +// CHECK-LABEL: ConvertIdentityGatherNdOp3D +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3x4xf32>) -> tensor<4x3x4xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3x4xf32> +} + +func @ConvertIdentityScatterNd(%arg0: tensor<4x3xf32>) -> tensor<4x3xf32> { + %cst = constant dense<[[0], [1], [2], [3]]> : tensor<4x1xi32> + %shape = constant dense<[4, 3]> : tensor<2xi32> + %0 = "tfl.scatter_nd"(%cst, %arg0, %shape) : (tensor<4x1xi32>, tensor<4x3xf32>, tensor<2xi32>) -> tensor<4x3xf32> + return %0 : tensor<4x3xf32> + +// CHECK-LABEL: ConvertIdentityScatterNd +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x3xf32>) -> tensor<4x3xf32> +// CHECK-NEXT: return %[[ARG]] : tensor<4x3xf32> +} + +func @ReshapeAddUnknownShape(%arg0: tensor<*xf32>) -> tensor<3x4xf32> { + %cst = constant dense<[3, 4]> : tensor<2xi32> + %cst_0 = constant dense<1.000000e+00> : tensor<3x4xf32> + %0 = "tfl.reshape"(%arg0, %cst) : (tensor<*xf32>, tensor<2xi32>) -> tensor<3x4xf32> + %1 = "tfl.add"(%0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xf32> + return %1 : tensor<3x4xf32> +// CHECK-LABEL: ReshapeAddUnknownShape +// CHECK: %[[rs1:.*]] = "tfl.reshape"(%arg0 +// CHECK: %[[rs2:.*]] = tfl.add %[[rs1]] +// CHECK: return %[[rs2]] +} + +func @FoldSumKeepDim(%arg0: tensor<8x128xf32>) -> tensor<8x1xf32> { + %cst = constant dense<1> : tensor<1xi32> + %cst_1 = constant dense<[8, 1]> : tensor<2xi32> + %0 = "tfl.sum"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<8xf32>, tensor<2xi32>) -> tensor<8x1xf32> + return %1 : tensor<8x1xf32> + +// CHECK-LABEL: FoldSumKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.sum"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> +// CHECK: return %[[RESULT]] : tensor<8x1xf32> +} + +func @FoldReduceMinKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { + %cst = constant dense<0> : tensor<1xi32> + %cst_1 = constant 
dense<[1, 128]> : tensor<2xi32> + %0 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<128xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<128xf32>, tensor<2xi32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + +// CHECK-LABEL: FoldReduceMinKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_min"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: return %[[RESULT]] : tensor<1x128xf32> +} + +func @FoldReduceMaxKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { + %cst = constant dense<0> : tensor<1xi32> + %cst_1 = constant dense<[1, 128]> : tensor<2xi32> + %0 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<128xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<128xf32>, tensor<2xi32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + +// CHECK-LABEL: FoldReduceMaxKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_max"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: return %[[RESULT]] : tensor<1x128xf32> +} + +func @FoldReduceProdKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x1xf32> { + %cst = constant dense<[0, 1]> : tensor<2xi32> + %cst_1 = constant dense<[1, 1]> : tensor<2xi32> + %0 = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor + %1 = "tfl.reshape"(%0, %cst_1) : (tensor, tensor<2xi32>) -> tensor<1x1xf32> + return %1 : tensor<1x1xf32> + +// CHECK-LABEL: FoldReduceProdKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor<1x1xf32> +// CHECK: return %[[RESULT]] : tensor<1x1xf32> +} + +func @SoftMaxWithNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128xf32> { + %cst = constant dense<1> : tensor<1xi32> + %0 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %1 = "tfl.sub"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + %2 = "tfl.exp"(%1) : (tensor<8x128xf32>) -> tensor<8x128xf32> + %3 = "tfl.sum"(%2, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %4 = "tfl.div"(%2, %3) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + return %4 : tensor<8x128xf32> + +// CHECK-LABEL: SoftMaxWithNormalization +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: return %[[RESULT]] : tensor<8x128xf32> +} + +func @SoftMaxWithoutNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128xf32> { + %cst = constant dense<1> : tensor<1xi32> + %0 = "tfl.exp"(%arg0) : (tensor<8x128xf32>) -> tensor<8x128xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %2 = "tfl.div"(%0, %1) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + return %2 : tensor<8x128xf32> + +// CHECK-LABEL: SoftMaxWithoutNormalization +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: return %[[RESULT]] : tensor<8x128xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 6847cdd5874..2b871769c81 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -457,6 +457,7 @@ func @inference_standard_lstm_time_major_cannot_fuse(%arg0: tensor, % // ----- module { +// expected-warning @+1 {{we cannot fuse this lstm func because the batch size is not fixed, please consider setting fixed batch size}} func @dynamic_shape_non_fuse_standard_lstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor @@ -519,3 +520,42 @@ func @func_with_call(%arg0: tensor<100xf32>) -> tensor<100xf32> { return %0 : tensor<100xf32> } } + +// ----- + +module { +func @tflite_custom_nms(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} { + %0 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + return %0, %1, %2, %3 : tensor, tensor, tensor, tensor +} + +// CHECK-LABEL: func @tflite_custom_nms( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x100x4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x100x91xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = "TFLite_Detection_PostProcess", tf._reference = "mlir"} { +// CHECK: %[[VAL_3:.*]]:4 = "tfl.custom"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {custom_code = "TFLite_Detection_PostProcess", custom_option = opaque<"tfl", "0x6D61785F646574656374696F6E73006D61785F636C61737365735F7065725F646574656374696F6E006E756D5F636C6173736573006E6D735F73636F72655F7468726573686F6C64006E6D735F696F755F7468726573686F6C6400795F7363616C6500785F7363616C6500685F7363616C6500775F7363616C65007573655F726567756C61725F6E6D73000A217E8E465B681720313A00000C000000010000000A0000000000803F010000000A0000009A99193F0000003F5B0000000000000000000040000020410000A0400E06060E0E06060E0E0E322601"> : tensor<217xi8>} : (tensor<1x100x4xf32>, tensor<1x100x91xf32>, tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) +// CHECK: return %[[VAL_3]]#0, %[[VAL_3]]#1, %[[VAL_3]]#2, %[[VAL_3]]#3 : tensor, tensor, tensor, tensor +// CHECK: } +} + +// ----- + +module { +// expected-error @+1 {{Invalid number of results from TFLite_Detection_PostProcess}} +func @tflite_custom_nms_invalid_results(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: 
tensor<100x4xf32>) -> (tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} + +// expected-error @+1 {{Invalid number of arguments to TFLite_Detection_PostProcess}} +func @tflite_custom_nms_invalid_args(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} + +// expected-error @+1 {{max_classes_per_detection attribute is not set or not an integer}} +func @tflite_custom_nms_missing_func_args(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} { + %0 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + return %0, %1, %2, %3 : tensor, tensor, tensor, tensor +} +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 066139e179b..a0cc6cc1fdb 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -1,4 +1,5 @@ // RUN: tf-opt -tfl-prepare-tf %s | FileCheck %s +// RUN: tf-opt %s -tf-layout-optimization=force-data-format=NHWC -tfl-prepare-tf | FileCheck --check-prefix=LAYOUT --dump-input=always %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { @@ -53,6 +54,15 @@ func @depthwiseConv2D(tensor<256x32x32x3xf32>, tensor<3x3x3x4xf32>, tensor<256x3 // CHECK: %5 = "tf.DepthwiseConv2dNative" } +func @Conv2dNCHW(%arg0: tensor<256x3x32x32xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x16x30x30xf32> { + %0 = "tf.Conv2D"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<256x3x32x32xf32>, tensor<3x3x3x16xf32>) -> tensor<256x16x30x30xf32> + return %0 : tensor<256x16x30x30xf32> + + // LAYOUT-LABEL: Conv2dNCHW + // LAYOUT: "tfl.conv_2d" +} + + func @fusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>) { ^bb0(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>): // OK @@ -82,8 +92,8 @@ func @fusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8 // offset - mean * scale * rsqrt(variance + epsilon) // CHECK: %[[ADD2:.*]] = "tf.Add"(%[[MUL2]], %[[SUB]]) -// CHECK: 
%[[BATCHNORM1_a:[^,]+]], {{.*}} = "tf.FusedBatchNorm"(%[[ADD2]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) -// CHECK: "tf.FusedBatchNorm"(%[[BATCHNORM1_a]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) +// CHECK: %[[BATCHNORM1_a:[^,]+]], {{.*}} = "tf.FusedBatchNormV3"(%[[ADD2]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) +// CHECK: "tf.FusedBatchNormV3"(%[[BATCHNORM1_a]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) } func @fusedBatchNormV3(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>) { @@ -483,6 +493,20 @@ func @StridedSliceEllipsisMaskBefore(%arg0: tensor<21x15x7xf32>) -> tensor<21x15 // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) {begin_mask = 3 : i64, ellipsis_mask = 0 : i64, end_mask = 3 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<21x15x2xf32> } +// CHECK-LABEL: @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask +func @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask(%arg0: tensor<4x5x4xf32>) -> tensor<4x4x4xf32> { + %cst = constant dense<[0, 1, 0]> : tensor<3xi32> + %cst_0 = constant dense<0> : tensor<3xi32> + %cst_1 = constant dense<1> : tensor<3xi32> + %0 = "tf.StridedSlice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 6 : i64, ellipsis_mask = 1 : i64, end_mask = 4 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> + return %0 : tensor<4x4x4xf32> + + // CHECK: %[[CST:.*]] = constant dense<[0, 1, 0]> : tensor<3xi32> + // CHECK: %[[CST_0:.*]] = constant dense<0> : tensor<3xi32> + // CHECK: %[[CST_1:.*]] = constant dense<1> : tensor<3xi32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST_0]], %[[CST_1]]) {begin_mask = 7 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> +} + // CHECK-LABEL: @StridedSliceEllipsisMaskAfter func @StridedSliceEllipsisMaskAfter(%arg0: tensor<21x15x7xf32>) -> tensor<5x15x7xf32> { %cst = constant dense<0> : tensor<2xi32> @@ -595,4 +619,51 @@ func @xla_conv(%arg0: tensor<4x8x8x16xf32>) -> tensor<4x8x8x16xf32> { // CHECK: return %[[RES]] } +func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { + %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> + return %0: tensor<3x3xf32> + +// CHECK-LABEL: broadcast_to_f32 +// CHECK: [[CST:%.*]] = constant dense<1.000000e+00> : tensor<3x3xf32> +// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> +// CHECK: return [[MUL]] : tensor<3x3xf32> +} + +func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3x3xi32> { + %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> + return %0: tensor<3x3xi32> + +// CHECK-LABEL: broadcast_to_i32 +// CHECK: [[CST:%.*]] = constant dense<1> : tensor<3x3xi32> +// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> +// CHECK: return [[MUL]] : tensor<3x3xi32> +} + +// CHECK-LABEL: lower_rfft_to_rfft2d +func @lower_rfft_to_rfft2d(%input: tensor<10x20x30xf32>, %fft_len: tensor<1xi32>) -> tensor<10x20x30xcomplex> { + %0 = "tf.RFFT"(%input, %fft_len) : (tensor<10x20x30xf32>, tensor<1xi32>) -> 
tensor<10x20x30xcomplex> + return %0: tensor<10x20x30xcomplex> + +// CHECK: %[[CST:.*]] = constant dense<-2> : tensor +// CHECK: %[[CST0:.*]] = constant dense<1> : tensor<1xi32> +// CHECK: %[[CST1:.*]] = constant dense<0> : tensor +// CHECK: %[[EXP:.*]] = "tf.ExpandDims"(%arg0, %[[CST]]) : (tensor<10x20x30xf32>, tensor) -> tensor<10x20x1x30xf32> +// CHECK: %[[CON:.*]] = "tf.ConcatV2"(%[[CST0]], %arg1, %[[CST1]]) : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> +// CHECK: %[[RFF:.*]] = "tf.RFFT2D"(%[[EXP]], %[[CON]]) : (tensor<10x20x1x30xf32>, tensor<2xi32>) -> tensor<10x20x1x30xcomplex> +// CHECK: %[[SQE:.*]] = "tf.Squeeze"(%[[RFF]]) {squeeze_dims = [-2]} : (tensor<10x20x1x30xcomplex>) -> tensor<10x20x30xcomplex> +} + +// CHECK-LABEL: xla_gather_to_slice +func @xla_gather_to_slice(%arg0 : tensor<1x9x104x768xf32>) -> tensor<*xf32> { + %0 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %1 = "tf.Const"() {value = dense<[1, 9, 23, 768]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.XlaGather"(%arg0, %0, %1) {device = "", dimension_numbers = "\0A\04\00\01\02\03\1A\01\02", indices_are_sorted = false} : (tensor<1x9x104x768xf32>, tensor<1xi32>, tensor<4xi32>) -> tensor<*xf32> + return %2 : tensor<*xf32> + +// CHECK: %[[CST:.*]] = constant dense<0> : tensor<4xi64> +// CHECK: %[[CST0:.*]] = constant dense<[1, 9, 23, 768]> : tensor<4xi64> +// CHECK: %[[V0:.*]] = "tf.Slice"(%arg0, %[[CST]], %[[CST0]]) : (tensor<1x9x104x768xf32>, tensor<4xi64>, tensor<4xi64>) -> tensor<*xf32> +// CHECK: return %[[V0]] : tensor<*xf32> +} + } diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index d63eb481376..2feb7fedb81 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -38,6 +38,10 @@ CreateTFExecutorToControlDialectConversion(); } // namespace mlir namespace tensorflow { +namespace { +// Data layout supported by TFLite. +const char kTFLiteDataLayout[] = "NHWC"; +} // namespace void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, mlir::OpPassManager* pass_manager) { @@ -170,6 +174,12 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, if (pass_config.shape_inference) { pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); } + // Force layout supported by TFLite, this will transpose the data + // to match 'kTFLiteDataLayout' + mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; + layout_optimization_options.force_data_format = kTFLiteDataLayout; + mlir::TF::CreateLayoutOptimizationPipeline(*pass_manager, + layout_optimization_options); // Prepare for TFLite dialect, rerun canonicalization, and then legalize to // the TFLite dialect. pass_manager->addPass( diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 414a0de0118..c158f3a8e21 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -129,6 +129,18 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( bool emit_select_tf_ops, bool emit_custom_ops, const mlir::TFL::QuantizationSpecs& quant_specs, std::string* result, mlir::PassManager* pass_manager) { + // Register a warning handler only log to std out. 
+ mlir::ScopedDiagnosticHandler s( + module.getContext(), [](mlir::Diagnostic& diag) { + if (diag.getSeverity() == mlir::DiagnosticSeverity::Warning) { + for (auto& note : diag.getNotes()) { + std::cout << note.str() << "\n"; + LOG(WARNING) << note.str() << "\n"; + } + } + return mlir::failure(); + }); + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 47cfaecd3fb..322da815a47 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -27,6 +27,9 @@ def NonOpaqueElementsAttr : ElementsAttrBase< def F32ElementsAttr : ElementsAttrBase< CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; +def Int64ElementsAttr : ElementsAttrBase< + CPred<"$_self.cast().getType().getElementType().isInteger(64)">, "Int 64 constant tensor">; + // Extract the ith int element from an ArrayAttr $0 as an 32-bit IntegerAttr // with builder. class ExtractI32At : NativeCodeCall< @@ -50,6 +53,10 @@ def ExtractSingleElementAsInteger : NativeCodeCall< def ExtractSingleElementAsInt32 : NativeCodeCall< "$_builder.getI32IntegerAttr(ExtractSingleElementAsInteger($_self.cast()).getInt())">; +// Converts tensor with int64 to int32. +def CreateCastToInt32 : NativeCodeCall< + "CreateCastToInt32($0, $_loc, $_builder)">; + // Checks whether the given operation has static shapes and same shapes of all inputs. def HasSameStaticShapesPred : CPred<"HasSameStaticShapes($0.getDefiningOp())">; def HasSameStaticShapes : Constraint; @@ -149,6 +156,7 @@ def LegalizeMaxPool2D : Pat< IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$format), (TFL_MaxPool2DOp $value, /*padding=*/$padding, @@ -207,8 +215,14 @@ def LegalizeSoftPlus : Pat<(TF_SoftplusOp F32Tensor:$arg0), def LegalizeSqueeze : Pat<(TF_SqueezeOp $arg, $squeeze_dims), (TFL_SqueezeOp $arg, $squeeze_dims)>; def LegalizeTanh : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; + +def LegalizeTransposeInt64 : Pat< + (TF_TransposeOp $arg, (ConstantOp Int64ElementsAttr:$perm)), + (TFL_TransposeOp $arg, (CreateCastToInt32 $perm))>; + def LegalizeTranspose : Pat<(TF_TransposeOp $arg, $perm), (TFL_TransposeOp $arg, $perm)>; + def LegalizeWhere : Pat<(TF_WhereOp $arg), (TFL_WhereOp $arg)>; def LegalizeZerosLike : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 7a16e475ce3..6f7f3b88471 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Threading.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project @@ -45,8 +46,10 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" @@ -64,7 +67,6 @@ namespace TFL { // The actual LegalizeTF Pass. namespace { -using xla::Status; using xla::StatusOr; constexpr char kUnidirectionalSequenceLstm[] = "tf.UnidirectionalSequenceLstm"; @@ -73,6 +75,10 @@ constexpr char kTfLiteInputIndices[] = "_tflite_input_indices"; // Legalize operations in functions. class LegalizeTF : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: LegalizeTF() = default; LegalizeTF(const LegalizeTF&) {} @@ -111,6 +117,17 @@ bool HasSameStaticShapes(Operation* op) { return true; } +// Util that casts 'val' to Int32 by adding a cast Op. +Value CreateCastToInt32(Attribute val, Location loc, + PatternRewriter& rewriter) { + auto shape = val.getType().dyn_cast().getShape(); + IntegerType new_ele_type = rewriter.getIntegerType(32); + ShapedType new_type = RankedTensorType::get(shape, new_ele_type); + return rewriter.create(loc, new_type, + rewriter.create(loc, val), + rewriter.getBoolAttr(false)); +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_legalize_tf.inc" #define DECL_CONVERT_OP(tf_op) \ @@ -137,7 +154,6 @@ DECL_CONVERT_OP(StridedSlice); DECL_CONVERT_OP(Unpack); DECL_CONVERT_OP(Reciprocal); DECL_CONVERT_OP(RandomUniform); -DECL_CONVERT_OP(BroadcastTo); #undef DECL_CONVERT_OP @@ -154,9 +170,8 @@ LogicalResult ConvertTFRandomUniformOp::matchAndRewrite( tensorflow::random::PhiloxRandom, float> Distribution; - tensorflow::random::PhiloxRandom generator( - random_uniform_op.seed().getSExtValue(), - random_uniform_op.seed2().getSExtValue()); + tensorflow::random::PhiloxRandom generator(random_uniform_op.seed(), + random_uniform_op.seed2()); Distribution dist; size_t num_elements = 0; if (auto output_type = @@ -227,26 +242,47 @@ LogicalResult ConvertTFConcatV2Op::matchAndRewrite( return success(); } -// The following is effectively: -// def : Pat< -// (TF_MatMulOp $a, $b, ConstBoolAttrFalse:$transpose_a, -// ConstBoolAttrTrue:$transpose_b), -// (TFL_FullyConnectedOp:$__0 $a, $b, -// NoInput.pattern, TFL_AF_None, TFL_FCWO_Default, ConstBoolAttrFalse)>; LogicalResult ConvertTFMatMulOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_matmul_op = cast(op); - if (tf_matmul_op.transpose_a()) return failure(); - if (!tf_matmul_op.transpose_b()) return failure(); + auto lhs = op->getOperand(0); + auto rhs = op->getOperand(1); + auto transpose = [&](Value input) -> std::pair { + RankedTensorType type = + input.getType().dyn_cast_or_null(); + if (!type || type.getRank() != 2) return {failure(), nullptr}; + + auto permute_attr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI32Type()), {1, 0}); + auto permute = rewriter.create( + op->getLoc(), permute_attr.getType(), permute_attr); + llvm::SmallVector new_shape{type.getShape()[1], + type.getShape()[0]}; + auto output = rewriter.create( + op->getLoc(), 
RankedTensorType::get(new_shape, type.getElementType()), + input, permute); + return {success(), output}; + }; + + // TODO(jpienaar): Remove once handled via dailect conversion. + if (tf_matmul_op.transpose_a()) { + LogicalResult result = success(); + std::tie(result, lhs) = transpose(lhs); + if (failed(result)) return failure(); + } + if (!tf_matmul_op.transpose_b()) { + LogicalResult result = success(); + std::tie(result, rhs) = transpose(rhs); + if (failed(result)) return failure(); + } Type output_type = tf_matmul_op.getResult().getType(); - // TODO(jpienaar): Follow up post shuffle discussion. auto no_input = rewriter.create( op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); auto fc_op = rewriter.create( - op->getLoc(), ArrayRef{output_type}, op->getOperand(0), - op->getOperand(1), no_input, rewriter.getStringAttr("NONE"), - rewriter.getStringAttr("DEFAULT"), rewriter.getBoolAttr(false)); + op->getLoc(), ArrayRef{output_type}, lhs, rhs, no_input, + rewriter.getStringAttr("NONE"), rewriter.getStringAttr("DEFAULT"), + rewriter.getBoolAttr(false)); rewriter.replaceOp(op, {fc_op.getResult(0)}); return success(); } @@ -259,7 +295,7 @@ LogicalResult ConvertTFPackOp::matchAndRewrite( auto output_type = tf_pack_op.output().getType(); auto values_count = rewriter.getI32IntegerAttr(tf_pack_op.N()); // Axis can be negative. - auto axis = rewriter.getI32IntegerAttr(tf_pack_op.axis().getSExtValue()); + auto axis = rewriter.getI32IntegerAttr(tf_pack_op.axis()); rewriter.replaceOpWithNewOp(op, output_type, values, values_count, axis); @@ -356,27 +392,22 @@ LogicalResult ConvertTFStridedSliceOp::matchAndRewrite( op, tf_strided_slice_op.output().getType(), tf_strided_slice_op.input(), tf_strided_slice_op.begin(), tf_strided_slice_op.end(), tf_strided_slice_op.strides(), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.begin_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.end_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.ellipsis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.new_axis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.shrink_axis_mask().getSExtValue())); + rewriter.getI32IntegerAttr(tf_strided_slice_op.begin_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.end_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.ellipsis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.new_axis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.shrink_axis_mask())); return success(); } int num_input_dims = ranked_input_type.getRank(); // Pad `begin` array with zero values and update the `begin_mask`. SmallVector begin_pad_val(num_input_dims, 0); - int begin_mask = tf_strided_slice_op.begin_mask().getSExtValue(); + int begin_mask = tf_strided_slice_op.begin_mask(); Value padded_begin = PadStridedSliceAttributeArray( op, rewriter, tf_strided_slice_op.begin(), begin_pad_val, &begin_mask); // Pad `end` array with `input_shape` and update the `end_mask`. 
- int end_mask = tf_strided_slice_op.end_mask().getSExtValue(); + int end_mask = tf_strided_slice_op.end_mask(); auto input_shape = ranked_input_type.getShape(); SmallVector end_pad_val(input_shape.begin(), input_shape.end()); Value padded_end = PadStridedSliceAttributeArray( @@ -390,12 +421,9 @@ LogicalResult ConvertTFStridedSliceOp::matchAndRewrite( padded_begin, padded_end, padded_strides, rewriter.getI32IntegerAttr(begin_mask), rewriter.getI32IntegerAttr(end_mask), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.ellipsis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.new_axis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.shrink_axis_mask().getSExtValue())); + rewriter.getI32IntegerAttr(tf_strided_slice_op.ellipsis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.new_axis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.shrink_axis_mask())); return success(); } @@ -406,7 +434,7 @@ LogicalResult ConvertTFUnpackOp::matchAndRewrite( auto input = tf_unpack_op.value(); auto num = rewriter.getI32IntegerAttr(tf_unpack_op.num()); // Axis can be negative. - auto axis = rewriter.getI32IntegerAttr(tf_unpack_op.axis().getSExtValue()); + auto axis = rewriter.getI32IntegerAttr(tf_unpack_op.axis()); rewriter.replaceOpWithNewOp(op, tf_unpack_op.output().getTypes(), input, num, axis); @@ -483,89 +511,6 @@ LogicalResult ConvertTFAssertOp::matchAndRewrite( return success(); } -StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, - Location loc, - ShapedType shaped_type, - int value) { - Type element_type = shaped_type.getElementType(); - ShapedType scalar_type = RankedTensorType::get({}, element_type); - Attribute attr; - switch (element_type.getKind()) { - case mlir::StandardTypes::F16: { - auto floatType = mlir::FloatType::getF16(element_type.getContext()); - auto floatAttr = - mlir::FloatAttr::get(floatType, static_cast(value)); - std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); - break; - } - case mlir::StandardTypes::BF16: { - auto floatType = mlir::FloatType::getBF16(element_type.getContext()); - auto floatAttr = - mlir::FloatAttr::get(floatType, static_cast(value)); - std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); - break; - } - case mlir::StandardTypes::F32: { - attr = - DenseElementsAttr::get(scalar_type, static_cast(value)); - break; - } - case mlir::StandardTypes::Complex: { - auto etype = element_type.cast().getElementType(); - if (etype.isF32()) { - auto dialect = etype.getContext()->getRegisteredDialect("tf"); - tensorflow::TensorProto repr; - repr.set_dtype(tensorflow::DT_COMPLEX64); - - tensorflow::TensorShapeProto* shape = repr.mutable_tensor_shape(); - shape->set_unknown_rank(false); - shape->add_dim()->set_size(int64_t{1}); - std::string content; - auto complex_value = - std::complex(static_cast(value), 0.0f); - content.assign(reinterpret_cast(&complex_value), - sizeof(complex_value)); - repr.set_tensor_content(content); - std::string mangled = tensorflow::mangling_util::MangleTensor(repr); - - attr = mlir::OpaqueElementsAttr::get(dialect, scalar_type, mangled); - break; - } - return Status(tensorflow::error::INVALID_ARGUMENT, "Unsupported type"); - } - case mlir::StandardTypes::Integer: { - const auto& itype = element_type.cast(); - switch (itype.getWidth()) { - case 8: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 16: - attr = 
DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 32: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 64: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - default: - return Status(tensorflow::error::INVALID_ARGUMENT, - "Unsupported type"); - } - break; - } - default: - return Status(tensorflow::error::INVALID_ARGUMENT, "Unsupported type"); - } - return rewriter->create(loc, scalar_type, attr); -} - LogicalResult ConvertTFReciprocalOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_reciprocal_op = cast(op); @@ -586,31 +531,6 @@ LogicalResult ConvertTFReciprocalOp::matchAndRewrite( return success(); } -LogicalResult ConvertTFBroadcastToOp::matchAndRewrite( - Operation* op, PatternRewriter& rewriter) const { - auto tf_broadcast_to_op = cast(op); - auto element_type = tf_broadcast_to_op.input().getType().cast(); - auto output_type = tf_broadcast_to_op.output().getType(); - - auto status_or_const_op = - CreateConstOpWithSingleValue(&rewriter, op->getLoc(), element_type, 1); - if (!status_or_const_op.ok()) { - return failure(); - } - - auto tfl_fill_op = rewriter.create( - op->getLoc(), output_type, tf_broadcast_to_op.shape(), - status_or_const_op.ValueOrDie()); - - StringAttr fused_activation_function = - StringAttr::get("NONE", rewriter.getContext()); - - rewriter.replaceOpWithNewOp( - op, output_type, tf_broadcast_to_op.input(), tfl_fill_op, - fused_activation_function); - return success(); -} - // Legalize unidirectional sequence lstm. struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { explicit LegalizeUnidirectionalSequenceLstm(MLIRContext* context) @@ -751,7 +671,7 @@ void LegalizeTF::runOnFunction() { ConvertTFMatrixDiagV3Op, ConvertTFPackOp, ConvertTFReshapeOp, ConvertTFSplitOp, ConvertTFSplitVOp, ConvertTFStridedSliceOp, ConvertTFUnpackOp, ConvertTFAssertOp, ConvertTFReciprocalOp, - ConvertTFRandomUniformOp, ConvertTFBroadcastToOp>(context); + ConvertTFRandomUniformOp>(context); // Ophint python converter converted tf node pattern. patterns.insert> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void RunOnFunction(FuncOp func); void runOnOperation() override { @@ -60,8 +64,8 @@ void RunOnWhile(TF::WhileOp while_op) { // Mark old function as private so that it can be DCE'd if not called. 
func.setVisibility(SymbolTable::Visibility::Private); }; - create_region_with_call(while_op.cond_func(), new_op.cond()); - create_region_with_call(while_op.body_func(), new_op.body()); + create_region_with_call(while_op.cond_function(), new_op.cond()); + create_region_with_call(while_op.body_function(), new_op.body()); op->replaceAllUsesWith(new_op.getResults()); op->erase(); diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index edddc7751ab..54bfc5fa3a7 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -714,7 +714,7 @@ struct ConvertTensorListStack RankedTensorType shape_type = RankedTensorType::get({-1}, rewriter.getIntegerType(32)); auto new_shape = rewriter.create(loc, shape_type, input); - SmallVector output_shape = {op.num_elements().getSExtValue()}; + SmallVector output_shape(/*Size=*/1, op.num_elements()); for (const auto &dim : dense_elem_attr.getIntValues()) output_shape.push_back(dim.getSExtValue()); RankedTensorType result_type = @@ -749,7 +749,7 @@ Type VariantToUnrankedTensorType(Type type, Value value) { // Changes the function type of `cond_func` and `body_func` for the given While // op. LogicalResult UpdateFunctionTypes(TF::WhileOp op) { - for (FuncOp func : {op.cond_func(), op.body_func()}) { + for (FuncOp func : {op.cond_function(), op.body_function()}) { if (!func) continue; FunctionType func_type = func.getType(); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 6de6187d81a..d28ee4b31fa 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -27,6 +27,7 @@ limitations under the License. #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -37,8 +38,10 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -103,7 +106,8 @@ bool OperandsBroadcastToOutputType(Type a, Type b, Type expected_output) { bool IsTailOfShape(Type type1, Type type2) { auto tail_type = type1.dyn_cast(); auto full_type = type2.dyn_cast(); - if (!tail_type || !full_type || tail_type.getRank() > full_type.getRank()) + if (!tail_type || !full_type || !tail_type.hasRank() || + !full_type.hasRank() || tail_type.getRank() > full_type.getRank()) return false; auto i1 = tail_type.getShape().rbegin(), e1 = tail_type.getShape().rend(); auto i2 = full_type.getShape().rbegin(); @@ -160,6 +164,31 @@ bool CanFuseConvOrDepthwiseConv(Attribute filter, Attribute val, return false; } +// Retuns true if we can eliminate the GatherNdOp or ScatterNdOp. 
When the value +// of `indices` are from 0 to n-1, the output tensor are identical to the +// `params`. +bool CanOptimizeIdentityGatherNdOrScatterNdOp(Value params, + DenseIntElementsAttr indices) { + auto params_type = params.getType().dyn_cast(); + auto indices_type = indices.getType().dyn_cast(); + // Checks the shape of `params` is [n, ...], shape of `indices` is [n, 1]. 2D + // `indices` means it gets the first row of `params`. As long as indices + // iterate the first row of `params`, the output is identical to input. + if (!params_type || !indices_type || indices_type.getRank() != 2 || + indices_type.getDimSize(0) != params_type.getDimSize(0) || + indices_type.getDimSize(1) != 1) + return false; + + // Checks the value in `indices` is from 0 to n-1. + int cur_value = 0; + for (const auto &v : indices.getValues()) { + if (v.getSExtValue() != cur_value) return false; + ++cur_value; + } + + return true; +} + // Expand Attribute 'a' to 4D with all 1s except 1 dimension. // Which dimension depends on 'is_depthwise' is true or false. ElementsAttr ExpandTo4DForConvImpl(Attribute a, bool is_depthwise) { @@ -219,6 +248,38 @@ static Type GetShapeStrippedType(TypeAttr type_attr) { } } +// Returns `true` if reducing `axes` in `input` with `keep_dims=true` results in +// the specified `shape` and `false` otherwise. +static bool ShapeMatchesReduceWithKeepAxes(Value input, + const mlir::Attribute &axes, + const mlir::Attribute &shape) { + RankedTensorType type = input.getType().dyn_cast_or_null(); + if (!type) return false; + + DenseIntElementsAttr axes_attr = + axes.dyn_cast_or_null(); + DenseIntElementsAttr shape_attr = + shape.dyn_cast_or_null(); + if (!axes_attr || !shape_attr) return false; + + if (shape_attr.getNumElements() != type.getRank()) return false; + + llvm::SmallSet axes_set; + for (auto a : axes_attr.getIntValues()) { + axes_set.insert(a.getZExtValue()); + } + + auto type_shape = type.getShape(); + for (uint64_t i = 0; i < type.getRank(); ++i) { + if (axes_set.contains(i)) { + if (shape_attr.getValue({i}) != 1) return false; + } else { + if (shape_attr.getValue({i}) != type_shape[i]) return false; + } + } + return true; +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" // Fuse Add with proceeding FullyConnected. diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index 2311ae0668c..f1ea837446b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -83,8 +83,8 @@ class FoldIfOp : public OpRewritePattern { if (!llvm::hasSingleElement(parent_op)) return failure(); // Find the then and else branch functions. - FuncOp then_func = op.then_func(); - FuncOp else_func = op.else_func(); + FuncOp then_func = op.then_function(); + FuncOp else_func = op.else_function(); // If the If has no uses and its functions are side-effect free, then // remove. 
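For context on the optimize.cc helpers above (ShapeMatchesReduceWithKeepAxes and CanOptimizeIdentityGatherNdOrScatterNdOp), here is a small self-contained sketch of the two equivalences they depend on: a reshape that only re-inserts the reduced size-1 axis can be folded by switching the reduction to keep_dims=true, and a gather_nd whose indices enumerate 0..n-1 is the identity. The flat-vector "tensors" and the ReduceSumLastAxis helper are illustrative assumptions only, not code from this patch or from TFLite.

```cpp
// Standalone sketch (not part of this patch) of the equivalences behind the
// new optimize.cc helpers; flat vectors stand in for tensors.
#include <cassert>
#include <cstdint>
#include <vector>

// Sum over the last axis of a row-major [rows x cols] matrix. The data is the
// same whether the result is reported as shape [rows] (keep_dims=false) or
// [rows, 1] (keep_dims=true); only the reported shape differs.
std::vector<float> ReduceSumLastAxis(const std::vector<float>& m, int rows,
                                     int cols) {
  std::vector<float> out(rows, 0.0f);
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c) out[r] += m[r * cols + c];
  return out;
}

int main() {
  const int rows = 2, cols = 3;
  const std::vector<float> x = {1, 2, 3, 4, 5, 6};  // 2x3, row-major

  // 1) FoldReshapeTo<Reduce>: reshape(reduce(x, keep_dims=false), [2, 1])
  //    carries exactly the data of reduce(x, keep_dims=true), so the reshape
  //    can be folded by flipping keep_dims -- the target-shape condition that
  //    ShapeMatchesReduceWithKeepAxes verifies.
  std::vector<float> reduced = ReduceSumLastAxis(x, rows, cols);
  assert(reduced == (std::vector<float>{6, 15}));

  // 2) OptimizeIdentityGatherNdOp: gathering the rows of an [n, ...] params
  //    tensor with indices [[0], [1], ..., [n-1]] reproduces params exactly,
  //    so the op can be replaced by its input once
  //    CanOptimizeIdentityGatherNdOrScatterNdOp confirms that index pattern.
  const std::vector<int32_t> indices = {0, 1};  // shape [2, 1]
  std::vector<float> gathered;
  for (int32_t i : indices)
    for (int c = 0; c < cols; ++c) gathered.push_back(x[i * cols + c]);
  assert(gathered == x);
  return 0;
}
```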
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 83a09e9dd2b..8243ed2a620 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -21,8 +21,13 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/lite/utils/utils.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +// Checks if the param passed is a F32 ElementsAttr. def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; + CPred<"$_self.isa() && $_self.cast().getType().getElementType().isF32()">, + "float constant tensor">; + +// Checks if the param passed is of NoneType. +def IsNoneType : Constraint()">>; def ExtractSingleElementAsFloat : NativeCodeCall< "ExtractSingleElementAsFloat($_self.cast())">; @@ -93,6 +98,29 @@ multiclass FuseBinaryOpToPrecedingAffine { $multiplier), [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), (HasOneUse $output)]>; + def FuseBinaryOpWithTransposeConv#binaryOp : Pat< + (binaryOp (TFL_TransposeConvOp:$output $output_shape, $weights, $inputs, + (ConstantOp F32ElementsAttr:$bias), $padding, + $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, $weights, $inputs, + (binaryOp (ConstantOp $bias), + (ConstantOp $value), TFL_AF_None), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (HasOneUse $output)]>; + // Fuse for TransposeConv with no bias + def FuseBinaryOpWithTransposeConvNoneBias#binaryOp : Pat< + (binaryOp (TFL_TransposeConvOp:$output $output_shape, $weights, $inputs, + (ConstantOp $bias), $padding, + $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, $weights, $inputs, + (ConstantOp $value), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (IsNoneType $bias), + (HasOneUse $output)]>; } foreach binaryOp = [TFL_AddOp, TFL_SubOp] in defm : FuseBinaryOpToPrecedingAffine; @@ -146,6 +174,39 @@ multiclass FuseMulOrDivWithConv2dOrDepthwiseConv2d { $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), (HasOneUse $conv_output)]>; + def FuseMulOrDivWithTransposeConv#BinaryOp : Pat< + (BinaryOp (TFL_TransposeConvOp:$output $output_shape, + (ConstantOp F32ElementsAttr:$weights), $input, + (ConstantOp F32ElementsAttr:$bias), + $padding, $stride_h, $stride_w), + (ConstantOp $value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, + (BinaryOp (ConstantOp $weights), + (ConstantOp (ExpandTo4DForConv $value)), + TFL_AF_None), + $input, + (BinaryOp (ConstantOp $bias), + (ConstantOp $value), + TFL_AF_None), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (HasOneUse $output)]>; + def FuseMulOrDivWithTransposeConvWithNoneBias#BinaryOp : Pat< + (BinaryOp (TFL_TransposeConvOp:$output $output_shape, + (ConstantOp F32ElementsAttr:$weights), $input, + (ConstantOp $bias), + $padding, $stride_h, $stride_w), + (ConstantOp $value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, + (BinaryOp (ConstantOp $weights), + (ConstantOp (ExpandTo4DForConv $value)), + TFL_AF_None), + $input, + (ConstantOp $bias), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + 
(IsNoneType $bias), + (HasOneUse $output)]>; } foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in @@ -508,3 +569,81 @@ foreach ActFun = [TFL_AF_Relu, TFL_AF_Relu6, TFL_AF_Relu1, TFL_AF_None] in { def OptimizeReluSquaredDifference : Pat< (TFL_ReluOp (TFL_SquaredDifferenceOp $l, $r)), (TFL_SquaredDifferenceOp $l, $r)>; + +// Optimize X^1 o X +def OptimizePow1ToIdentity : Pat< + (TFL_PowOp $input, + (ConstantOp ConstantAttr, "1.0f">)), + (replaceWithValue $input)>; + +// Optimize X^2 to X*X +def OptimizePow2ToSquare : Pat< + (TFL_PowOp $input, + (ConstantOp ConstantAttr, "2.0f">)), + (TFL_MulOp $input, $input, TFL_AF_None)>; + +def CanOptimizeIdentityGatherNdOrScatterNdOp : Constraint())">>; + +def OptimizeIdentityGatherNdOp : Pat< + (TFL_GatherNdOp $params, (ConstantOp I32ElementsAttr: $indices)), + (replaceWithValue $params), + [(CanOptimizeIdentityGatherNdOrScatterNdOp $params, $indices)]>; + +def OptimizeIdentityScatterNdOp : Pat< + (TFL_ScatterNdOp (ConstantOp I32ElementsAttr: $indices), $params, $ignored), + (replaceWithValue $params), + [(CanOptimizeIdentityGatherNdOrScatterNdOp $params, $indices)]>; + +def ShapeMatchesReduceWithKeepAxes : Constraint>; + +// Fold reshapes re-inserting reduced dimensions into the results of a reduction +// with `keep_dims=false` by chaning it to one using `keep_dims=true`. +foreach ReduceOp = [TFL_ReduceMaxOp, TFL_ReduceMinOp, TFL_ReduceProdOp, + TFL_SumOp] in { + def FoldReshapeTo#ReduceOp : Pat< + (TFL_ReshapeOp + (ReduceOp:$reduce $input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrFalse), + (ConstantOp I32ElementsAttr: $shape)), + (ReduceOp $input, (ConstantOp $axes), ConstBoolAttrTrue), + [(ShapeMatchesReduceWithKeepAxes $input, $axes, $shape), + (HasOneUse $reduce)]>; +} + + +def IsSame : Constraint>; +def HasTwoUse : Constraint>; +def AxesIsLastDimension : Constraint().getNumElements() == 1 && " + "$0.cast().getValue({0}) == " + "$1.getType().cast().getRank() - 1">>; + +// Convert exp(x)/sum(exp(x)) into softmax. +def OptimizeToSoftmax : Pat< + (TFL_DivOp (TFL_ExpOp:$exp $input), + (TFL_SumOp:$sum $sum_input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrTrue), TFL_AF_None), + (TFL_SoftmaxOp $input, ConstF32Attr<"1.0">), + [(IsSame $exp, $sum_input), + (AxesIsLastDimension $axes, $sum_input), + (HasTwoUse $exp), + (HasOneUse $sum)]>; + +// Convert softmax(x-max(x)) into softmax(x) as the softmax op already deals +// with the max normalization. +def FoldNormalizationIntoSoftmax : Pat< + (TFL_SoftmaxOp + (TFL_SubOp:$sub $input, + (TFL_ReduceMaxOp:$max $max_input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrTrue), + TFL_AF_None), + $beta), + (TFL_SoftmaxOp $input, $beta), + [(IsSame $input, $max_input), + (AxesIsLastDimension $axes, $max_input), + (HasOneUse $sub), + (HasOneUse $max)]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 3be6246c0dd..172ce59ddd4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -42,6 +42,7 @@ limitations under the License. 
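The OptimizeToSoftmax and FoldNormalizationIntoSoftmax patterns above hinge on the shift invariance of softmax; as a quick illustration (this derivation is only a reading aid, not part of the TableGen changes):

\[
\operatorname{softmax}(x)_i \;=\; \frac{e^{x_i}}{\sum_j e^{x_j}},
\qquad
\operatorname{softmax}(x - c\mathbf{1})_i
  \;=\; \frac{e^{x_i - c}}{\sum_j e^{x_j - c}}
  \;=\; \frac{e^{-c}\, e^{x_i}}{e^{-c}\sum_j e^{x_j}}
  \;=\; \operatorname{softmax}(x)_i
\quad \text{for any scalar } c .
\]

With c = max_j x_j this is exactly the max normalization that FoldNormalizationIntoSoftmax strips, and exp(x)/sum(exp(x), last_axis) is the definition OptimizeToSoftmax matches; the HasTwoUse/HasOneUse constraints ensure the exp and sum feed nothing but the rewritten div, so they can be dropped.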
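Likewise, the FoldReshapeTo#ReduceOp rewrite above only re-inserts the dimensions that keep_dims=true would have preserved, and the identity GatherNd/ScatterNd folds rely on indices enumerating the first axis in order. A small NumPy sanity check of both facts (illustrative only, not code from this change):

    import numpy as np

    x = np.random.randn(2, 3, 4).astype(np.float32)

    # Reshape(ReduceSum(x, axes, keep_dims=False), kept_shape)
    #   == ReduceSum(x, axes, keep_dims=True)
    kept = np.sum(x, axis=-1, keepdims=True)                      # shape (2, 3, 1)
    reshaped = np.sum(x, axis=-1, keepdims=False).reshape(2, 3, 1)
    assert np.array_equal(kept, reshaped)

    # GatherNd(params, [[0], [1], ..., [n-1]]) is the identity on the first axis.
    idx = np.arange(x.shape[0])
    assert np.array_equal(x[idx], x)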
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/nms_utils.h" #include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -59,6 +60,7 @@ namespace { constexpr char kTFAPIImplements[] = "tf.api_implements"; constexpr char kTFTextAPIPrefix[] = "tftext:"; +constexpr char kCustomSSDPostprocessing[] = "TFLite_Detection_PostProcess"; constexpr char kTfNMSPadded[] = "non_max_suppression_padded_v2"; using mlir::TF::FuncAttr; @@ -99,59 +101,6 @@ class ConvertEmbeddedLookupFunc { FuncOp func_; }; -// Abstracts the conversion of the padded NMS composite function. -class ConvertNMSPaddedFunc { - public: - explicit ConvertNMSPaddedFunc(FuncOp func) : func_(func) {} - - void RewriteFunc() { - func_.setAttr(kTFImplements, - StringAttr::get(kTfNMSPadded, func_.getContext())); - Value boxes = func_.getArgument(0); - Value scores = func_.getArgument(1); - Value max_output_size = func_.getArgument(2); - Value iou_threshold = func_.getArgument(3); - Value score_threshold = func_.getArgument(4); - auto output_type0 = func_.getType().getResult(0); - auto output_type1 = func_.getType().getResult(1); - - OpBuilder builder(func_.getBody()); - auto op = builder.create( - func_.getLoc(), output_type0, output_type1, boxes, scores, - max_output_size, iou_threshold, score_threshold); - - builder.create(func_.getLoc(), op.getResults()); - } - - LogicalResult VerifySignature() { - // Verify high-level function signature. - // Relevant argument characteristics are checked by the TFL op definition. - if (func_.getNumArguments() < 5) { - return func_.emitError() - << "Invalid number of arguments to " - "non_max_suppression_padded_v2 (need atleast 5): " - << func_.getNumArguments(); - } - if (func_.getType().getNumResults() != 2) { - return func_.emitError() << "Invalid number of results from " - "non_max_suppression_padded_v2 (need 2): " - << func_.getType().getNumResults(); - } - // The TFLite fused op does not support batching yet. - // TODO(b/158709815): Add support for batches with padded NMS. 
- auto boxes_type = - func_.getArgument(0).getType().dyn_cast(); - if (!boxes_type.hasRank() || boxes_type.getRank() != 2) { - return func_.emitError() << "TFLite does not support batched input for " - "non_max_suppression_padded"; - } - return success(); - } - - private: - FuncOp func_; -}; - // This pass uses mechanisms listed in RFC: // https://github.com/tensorflow/community/pull/113 // It prepares composite functions that are attributed to indicate @@ -161,6 +110,10 @@ class ConvertNMSPaddedFunc { class PrepareCompositeFunctionsPass : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: explicit PrepareCompositeFunctionsPass() {} @@ -219,6 +172,12 @@ void PrepareCompositeFunctionsPass::ConvertTFImplementsWithAttributes( if (failed(ConvertTFTextAPI(func, api_name, attr))) { return signalPassFailure(); } + } else if (api_name == kCustomSSDPostprocessing) { + ConvertSSDPostProcessFunc convert_ssd_postprocess(func, attr); + if (failed(convert_ssd_postprocess.VerifySignature()) || + failed(convert_ssd_postprocess.RewriteFunc())) { + return signalPassFailure(); + } } } @@ -261,7 +220,15 @@ LogicalResult CheckFusableKerasLstm(FuncOp lstm_func, ModuleOp module) { for (int i = 1; i < 3; ++i) { auto input = lstm_func.getArgument(i); auto input_type = input.getType().dyn_cast_or_null(); - if (!input_type || !input_type.hasStaticShape()) return failure(); + if (!input_type || !input_type.hasStaticShape()) { + lstm_func.emitWarning( + "we cannot fuse this lstm func because the batch size is not fixed, " + "please consider setting fixed batch size like " + "https://github.com/tensorflow/tensorflow/blob/master/tensorflow/" + "lite/examples/experimental_new_converter/" + "Keras_LSTM_fusion_Codelab.ipynb"); + return failure(); + } } return success(); diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index f5b252773f6..326b6b23398 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -40,7 +40,7 @@ def : Pat< (TF_MulOp $t, (TF_MulOp:$mul (TF_RsqrtOp (TF_AddOp $v, (TF_ConstOp $variance_epsilon))), $gamma)), (TF_SubOp $beta, (TF_MulOp $m, $mul)))>; -// Converts tf.FusedBatchNorm & tf.FusedBatchNormV3 into a sequence of more primitive arithmetic +// Converts tf.FusedBatchNormV3 into a sequence of more primitive arithmetic // operations. Specifically, performs the following calculation: // // (x - mean) * scale / sqrt(variance + epsilon) + offset @@ -50,28 +50,6 @@ def : Pat< // (x - mean) * scale / sqrt(variance + epsilon) + offset, // is then to compute // (x * multiplier) + (offset - mean * multiplier). -def : Pattern< - (TF_FusedBatchNormOp:$root - $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $exponential_avg_factor, - $data_format, FalseBoolAttr:$is_training), - [(TF_AddOp - (TF_MulOp - $x, - (TF_MulOp:$multiplier - $scale, - (TF_RsqrtOp - (TF_AddOp $variance, - (TF_ConstOp $epsilon))))), - (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), - // We already guaranteed that the last four results has no use so it does - // not matter what value we provide here for replacement. 
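The new warning in CheckFusableKerasLstm above asks users to fix the batch size before conversion. A minimal Python sketch of what that looks like (the layer sizes are arbitrary and this mirrors the linked Keras LSTM fusion codelab rather than code in this change):

    import tensorflow as tf

    # A fixed batch size gives the LSTM composite function static input shapes,
    # which the fusion check above requires.
    inputs = tf.keras.Input(shape=(28, 28), batch_size=1)
    x = tf.keras.layers.LSTM(20)(inputs)
    outputs = tf.keras.layers.Dense(10)(x)
    model = tf.keras.Model(inputs, outputs)

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_model = converter.convert()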
- /*batch_mean=*/(replaceWithValue $x), - /*batch_variance=*/(replaceWithValue $x), - /*reserve_space_1=*/(replaceWithValue $x), - /*reserve_space_2=*/(replaceWithValue $x)], - [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), - (HasNoUseOf:$root__3), (HasNoUseOf:$root__4)]>; def : Pattern< (TF_FusedBatchNormV3Op:$root diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 9a27d0de62a..783f21fce21 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -68,6 +69,11 @@ namespace { // training quantization simpler. class PrepareQuantizePass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: // Constructor used by the PassRegistration and enforce uint8 quantization. // This is only used by test. @@ -122,6 +128,10 @@ class PrepareQuantizePass // the best quantization practise. This also fixes some simple violations. void SanityCheckAndAdjustment(FuncOp func); + // Whether the func contains Quantize ops. This is used to determine whether + // to use the quantization parameters from the fixed output range property. + bool ContainsQuantizeOps(FuncOp func); + QuantizationSpecs quant_specs_; }; @@ -285,6 +295,13 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { }); } +bool PrepareQuantizePass::ContainsQuantizeOps(FuncOp func) { + for (const auto& op : func.getOps()) { + if (llvm::isa(op)) return true; + } + return false; +} + using PrepareQuantStats = quant::ConvertStatsToQDQs; @@ -309,6 +326,7 @@ void PrepareQuantizePass::runOnFunction() { OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); int bit_width = quant_specs_.GetQuantizationTypeWidth(); + bool enforce_fixed_output_range = ContainsQuantizeOps(func); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. @@ -327,7 +345,8 @@ void PrepareQuantizePass::runOnFunction() { // values (tensors). ApplyQuantizationParamsPropagation( func, is_signed, disable_per_channel || quant_specs_.disable_per_channel, - GetOpQuantSpec); + GetOpQuantSpec, + enforce_fixed_output_range || quant_specs_.post_training_quantization); ConvertMlirQuantOpsToTFLQuantOps(func); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 62688937d7e..2b118d0b810 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -40,6 +40,7 @@ limitations under the License. 
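The prepare_quantize.cc change above also enforces fixed output ranges when quant_specs_.post_training_quantization is set; that flag roughly corresponds to the usual post-training quantization flow on the Python side, sketched here (illustrative only; the saved-model path and input shape are hypothetical):

    import numpy as np
    import tensorflow as tf

    converter = tf.lite.TFLiteConverter.from_saved_model("/tmp/my_model")  # hypothetical path

    def representative_dataset():
      for _ in range(10):
        yield [np.random.rand(1, 28, 28).astype(np.float32)]

    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset
    tflite_quant_model = converter.convert()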
#include "llvm/Support/Debug.h" #include "mlir/Analysis/LoopAnalysis.h" // from @llvm-project #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -57,6 +58,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/transforms/dilated_conv.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" @@ -78,13 +80,23 @@ namespace { // Prepare TF operations in functions for subsequent legalization. class PrepareTFPass : public PassWrapper { public: - explicit PrepareTFPass() : unfold_batch_matmul_(true) {} - explicit PrepareTFPass(bool unfold_batch_matmul) - : unfold_batch_matmul_(unfold_batch_matmul) {} + PrepareTFPass() = default; + PrepareTFPass(const PrepareTFPass &) {} + explicit PrepareTFPass(bool unfold_batch_matmul) { + unfold_batch_matmul_ = unfold_batch_matmul; + } void runOnFunction() override; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + private: - bool unfold_batch_matmul_; + Option unfold_batch_matmul_{ + *this, "tfl-unfold-batch-matmul", + llvm::cl::desc("Unfold BatchMatMul into individual MatMul ops."), + llvm::cl::init(true)}; }; template @@ -203,9 +215,8 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. - rewriter.setInsertionPointAfter(tf_op); - IntegerAttr num_bits = - rewriter.getI64IntegerAttr(tf_op.num_bits().getSExtValue()); + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.num_bits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.narrow_range()); Type res_type = tf_op.getType(); TypeAttr qtype = quant::GetQuantizedTypeAttr( @@ -526,8 +537,8 @@ struct ConvertTFStridedSlice : public RewritePattern { loc, new_output_type, original_input, shape); // Replace the original strided_slice. - llvm::APInt new_begin_mask = strided_slice_op.begin_mask(); - llvm::APInt new_end_mask = strided_slice_op.end_mask(); + uint64_t new_begin_mask = strided_slice_op.begin_mask(); + uint64_t new_end_mask = strided_slice_op.end_mask(); // Since we expand the dims, we need to apply them to the begin_mask & // end_mask. new_begin_mask |= strided_slice_op.new_axis_mask(); @@ -590,8 +601,8 @@ struct ConvertTFStridedSlice : public RewritePattern { const int ellipsis_filled_dim_size = input_size - begin_shape[0] + 1; - int64_t begin_mask = strided_slice_op.begin_mask().getSExtValue(); - int64_t end_mask = strided_slice_op.end_mask().getSExtValue(); + int64_t begin_mask = strided_slice_op.begin_mask(); + int64_t end_mask = strided_slice_op.end_mask(); int64_t new_begin_mask = 0; int64_t new_end_mask = 0; @@ -627,13 +638,16 @@ struct ConvertTFStridedSlice : public RewritePattern { ++index; // After the ellipsis. 
- for (; index < begin_shape[0]; ++index) { + for (; index < begin_shape[0];) { padded_begin.push_back(begin_dense_elem_attr.getValue(index)); padded_end.push_back(end_dense_elem_attr.getValue(index)); padded_stride.push_back(stride_dense_elem_attr.getValue(index)); if ((begin_mask >> index) & 1) new_begin_mask |= (1 << new_index); if ((end_mask >> index) & 1) new_end_mask |= (1 << new_index); + + ++index; + ++new_index; } auto attribute_type = rewriter.getIntegerType(64); @@ -669,16 +683,16 @@ struct ConvertTFStridedSlice : public RewritePattern { // TODO(renjieliu): Consider expand the transformation for shrink mask as // well. - if (strided_slice_op.shrink_axis_mask().getZExtValue()) return failure(); + if (strided_slice_op.shrink_axis_mask()) return failure(); // Handle new axis mask. - uint64_t new_axis_mask = strided_slice_op.new_axis_mask().getZExtValue(); + uint64_t new_axis_mask = strided_slice_op.new_axis_mask(); if (new_axis_mask != 0) { return RewriteNewAxisMask(strided_slice_op, new_axis_mask, rewriter); } // Handle ellipsis mask. - uint64_t ellipsis_mask = strided_slice_op.ellipsis_mask().getZExtValue(); + uint64_t ellipsis_mask = strided_slice_op.ellipsis_mask(); if (ellipsis_mask != 0) { return RewriteEllipsisMask(strided_slice_op, ellipsis_mask, rewriter); } @@ -686,6 +700,71 @@ struct ConvertTFStridedSlice : public RewritePattern { } }; +struct ConvertTFBroadcastTo : public RewritePattern { + explicit ConvertTFBroadcastTo(MLIRContext *context) + : RewritePattern(TF::BroadcastToOp::getOperationName(), 1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + auto tf_broadcast_to_op = cast(op); + auto input_type = tf_broadcast_to_op.input().getType().cast(); + auto output_type = tf_broadcast_to_op.output().getType().cast(); + auto shape_type = tf_broadcast_to_op.shape().getType().cast(); + Type element_type = input_type.getElementType(); + + // Allow lowering when low dimension inputs are given and its type is F32 or + // I32. 
+ if (!((output_type.hasRank() && output_type.getRank() <= 5) || + (shape_type.hasStaticShape() && shape_type.getRank() == 1 && + shape_type.getDimSize(0) <= 5))) + return failure(); + + if (!(element_type.isa() || + element_type.isInteger(32))) + return failure(); + + auto status_or_const_op = + CreateConstOpWithSingleValue(&rewriter, op->getLoc(), input_type, 1); + if (!status_or_const_op.ok()) { + return failure(); + } + + auto tf_fill_op = rewriter.create( + op->getLoc(), output_type, tf_broadcast_to_op.shape(), + status_or_const_op.ValueOrDie()); + + auto mul_op = rewriter.create( + op->getLoc(), output_type, tf_broadcast_to_op.input(), tf_fill_op); + rewriter.replaceOp(op, mul_op.getResult()); + return success(); + } +}; + +struct ConvertFusedBatchNorm : public OpRewritePattern { + explicit ConvertFusedBatchNorm(MLIRContext *context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(TF::FusedBatchNormOp tf_fused_batch_norm_op, + PatternRewriter &rewriter) const override { + auto new_result_types = + llvm::to_vector<6>(tf_fused_batch_norm_op.getResultTypes()); + // reserve_space_3 + new_result_types.push_back( + UnrankedTensorType::get(FloatType::getF32(rewriter.getContext()))); + + OperationState new_state(tf_fused_batch_norm_op.getLoc(), + TF::FusedBatchNormV3Op::getOperationName(), + tf_fused_batch_norm_op.getOperands(), + new_result_types, + tf_fused_batch_norm_op.getAttrs()); + Operation *tf_fused_batch_norm_op_v3 = rewriter.createOperation(new_state); + + rewriter.replaceOp(tf_fused_batch_norm_op, + tf_fused_batch_norm_op_v3->getResults().drop_back()); + return success(); + } +}; + #include "tensorflow/compiler/mlir/lite/transforms/generated_prepare_tf.inc" // Returns success if all the operations in the `op`'s regions including `op` @@ -711,14 +790,113 @@ LogicalResult ConvertTf2XlaOps(FuncOp func, MLIRContext *context) { target.addLegalOp(); target.addLegalOp(); target.addIllegalOp(); + target.addIllegalOp(); OwningRewritePatternList patterns; mhlo::PopulateLegalizeTfWithTf2XlaPatterns("XLA_CPU_JIT", patterns); + mhlo::PopulateLegalizeTfPatterns(context, &patterns); TF::PopulateLegalizeHloToTfPatterns(&patterns, context); + mhlo::GatherOp::getCanonicalizationPatterns(patterns, context); return applyPartialConversion(func, target, patterns); } +// Convert rfft to rfft2d. +// The transformation pattern looks like below: +// +// input fft_len +// \ / +// rfft +// +// || +// \/ +// +// input fft_len +// \ / +// expand_dim concat with [1] at the front +// \ / +// rfft_2d +// | +// squeeze +struct ConvertRfftToRfft2d : public RewritePattern { + explicit ConvertRfftToRfft2d(MLIRContext *context) + : RewritePattern(TF::RFFTOp::getOperationName(), 1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + auto rfft_op = dyn_cast(op); + + auto input = rfft_op.input(); + auto input_type = input.getType().dyn_cast_or_null(); + if (!input_type) return failure(); + auto fft_len = rfft_op.fft_length(); + auto fft_len_type = fft_len.getType().dyn_cast_or_null(); + if (!fft_len_type) return failure(); + + auto output_type = + rfft_op.getResult().getType().dyn_cast_or_null(); + if (!output_type) return failure(); + + // Expanded inputs. + // Insert at -2 location. 
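The ConvertTFBroadcastTo pattern above lowers tf.BroadcastTo to a Fill of ones followed by a Mul, relying on ordinary broadcasting. A NumPy check of that equivalence (illustrative only, not code from this change):

    import numpy as np

    x = np.arange(6, dtype=np.float32).reshape(2, 3)
    shape = (4, 2, 3)

    # BroadcastTo(x, shape) == Mul(x, Fill(shape, 1)); the ones tensor carries the
    # target shape and elementwise multiplication broadcasts x up to it.
    assert np.array_equal(np.broadcast_to(x, shape),
                          x * np.ones(shape, dtype=x.dtype))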
+ auto one_ele_type = + mlir::RankedTensorType::get({1}, rewriter.getIntegerType(32)); + auto minus_two = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(), + one_ele_type, -2); + + SmallVector expanded_input_shape; + SmallVector expanded_output_shape; + int expanded_rank = input_type.getRank() + 1; + int r = 0; + for (int i = 0; i < expanded_rank; ++i) { + if (i == expanded_rank - 2) { + expanded_input_shape.push_back(1); + expanded_output_shape.push_back(1); + } else { + expanded_input_shape.push_back(input_type.getDimSize(r)); + expanded_output_shape.push_back(output_type.getDimSize(r)); + r++; + } + } + + auto expaned_input_type = mlir::RankedTensorType::get( + expanded_input_shape, input_type.getElementType()); + TF::ExpandDimsOp expanded_input = rewriter.create( + rfft_op.getLoc(), expaned_input_type, input, minus_two->getResult()); + + // Expanded fft_len. + auto one_attr = mlir::DenseIntElementsAttr::get(one_ele_type, {1}); + + auto one = rewriter.create(rfft_op.getLoc(), one_attr); + + auto zero = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(), + one_ele_type, 0); + + auto expanded_fft_len_type = + mlir::RankedTensorType::get({2}, fft_len_type.getElementType()); + + TF::ConcatV2Op expanded_fft_len = rewriter.create( + rfft_op.getLoc(), expanded_fft_len_type, + SmallVector({one.getResult(), fft_len}), zero->getResult()); + + // Insert the rfft_2d. + auto rfft2d_out_type = mlir::RankedTensorType::get( + expanded_output_shape, output_type.getElementType()); + TF::RFFT2DOp rfft2d = rewriter.create( + rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(), + expanded_fft_len.getResult()); + + // Insert the squeeze op. + auto squeeze_dim = rewriter.getI64ArrayAttr({-2}); + TF::SqueezeOp squeeze = rewriter.create( + rfft_op.getLoc(), output_type, rfft2d.getResult(), squeeze_dim); + + rewriter.replaceOp(op, squeeze.getResult()); + + return success(); + } +}; + void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); @@ -751,6 +929,8 @@ void PrepareTFPass::runOnFunction() { // replaced with a single Conv op with dilation parameter. patterns.insert, ConvertTFDilatedConvOp>(ctx); + + patterns.insert(ctx); TFL::populateWithGenerated(ctx, &patterns); // TODO(karimnosseir): Split to separate pass probably after // deciding on long term plan for this optimization. @@ -767,8 +947,9 @@ void PrepareTFPass::runOnFunction() { patterns.insert, TF::ConvertTFBatchMatMulOp>(ctx); } - patterns.insert(ctx); + patterns.insert(ctx); applyPatternsAndFoldGreedily(func, patterns); } diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index 3342981b75f..56b38ec58d8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -80,7 +80,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { // The basic block arguments correspond to values that are loop carried, while // all those post are loop independent. Initialize extern_values with while_op // not loop carried operands. - auto num_loop_carried = while_op.cond().front().getNumArguments(); + auto num_loop_carried = while_op.cond().getNumArguments(); auto not_carried_operands = while_op.getOperands().drop_front(num_loop_carried); extern_values.insert(not_carried_operands.begin(), @@ -124,8 +124,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { // Collect new types. 
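The ConvertRfftToRfft2d rewrite above is exact because an FFT over a unit-length axis is the identity, so RFFT(x, n) equals Squeeze(RFFT2D(ExpandDims(x, -2), [1, n]), -2). A NumPy check (illustrative only, not code from this change):

    import numpy as np

    x = np.random.randn(3, 16).astype(np.float32)
    fft_len = 16

    direct = np.fft.rfft(x, n=fft_len, axis=-1)
    # Expand a unit dimension at -2, take a 2-D RFFT over [1, fft_len], squeeze it away.
    via_2d = np.squeeze(np.fft.rfft2(x[..., np.newaxis, :], s=(1, fft_len)), axis=-2)
    assert np.allclose(direct, via_2d)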
SmallVector types; types.reserve(extra_operands.size() + while_op.getNumOperands()); - for (BlockArgument ba : while_op.cond().front().getArguments()) - types.push_back(ba.getType()); + for (Type type : while_op.cond().getArgumentTypes()) types.push_back(type); for (Value operand : extern_values) types.push_back(operand.getType()); // Create outline function from region. Optional pass extra arguments through diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc new file mode 100644 index 00000000000..b32da24d00f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace TFL { + +stream_executor::port::StatusOr CreateConstOpWithSingleValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, + int value) { + Type element_type = shaped_type.getElementType(); + ShapedType scalar_type = RankedTensorType::get({}, element_type); + Attribute attr; + if (element_type.isF16()) { + auto floatType = mlir::FloatType::getF16(element_type.getContext()); + auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); + std::vector floatValues({floatAttr}); + attr = DenseElementsAttr::get(scalar_type, floatValues); + } else if (element_type.isBF16()) { + auto floatType = mlir::FloatType::getBF16(element_type.getContext()); + auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); + std::vector floatValues({floatAttr}); + attr = DenseElementsAttr::get(scalar_type, floatValues); + } else if (element_type.isF32()) { + attr = + DenseElementsAttr::get(scalar_type, static_cast(value)); + } else if (auto complex_type = element_type.dyn_cast()) { + auto etype = complex_type.getElementType(); + if (etype.isF32()) { + auto dialect = etype.getContext()->getLoadedDialect("tf"); + tensorflow::TensorProto repr; + repr.set_dtype(tensorflow::DT_COMPLEX64); + + tensorflow::TensorShapeProto* shape = repr.mutable_tensor_shape(); + shape->set_unknown_rank(false); + shape->add_dim()->set_size(int64_t{1}); + std::string content; + auto complex_value = std::complex(static_cast(value), 0.0f); + content.assign(reinterpret_cast(&complex_value), + sizeof(complex_value)); + repr.set_tensor_content(content); + std::string mangled = tensorflow::mangling_util::MangleTensor(repr); + + attr = mlir::OpaqueElementsAttr::get(dialect, scalar_type, mangled); + } else { + return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); + } + } 
else if (auto itype = element_type.dyn_cast()) { + switch (itype.getWidth()) { + case 8: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + case 16: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + case 32: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + case 64: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + default: + return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); + } + } else { + return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); + } + return rewriter->create(loc, scalar_type, attr); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.h b/tensorflow/compiler/mlir/lite/utils/constant_utils.h new file mode 100644 index 00000000000..5c348021b5e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace mlir { +namespace TFL { + +// Returns a Constant op with a single value. 
+stream_executor::port::StatusOr CreateConstOpWithSingleValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, int value); + +} // namespace TFL +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 081ba7ac6e7..f26689fac5e 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -93,8 +93,9 @@ class LstmUtilsTest : public ::testing::Test { LstmUtilsTest() {} void SetUp() override { - RegisterDialects(); context_ = std::make_unique(); + context_->loadDialect(); builder_ = std::unique_ptr(new Builder(context_.get())); fused_lstm_func_ = createLstmCompositeFunc(builder_.get(), false, false); fused_lstm_func_cifg_ = @@ -109,12 +110,6 @@ class LstmUtilsTest : public ::testing::Test { builder_.reset(); } - void RegisterDialects() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - } - FuncOp fused_lstm_func_; FuncOp fused_lstm_func_cifg_; FuncOp fused_ln_lstm_func_; diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc new file mode 100644 index 00000000000..e462d4f38b0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc @@ -0,0 +1,174 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/nms_utils.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { + +namespace { + +// TODO(b/162842801): Consolidate all util definitions of kTFImplements. 
+constexpr char kTFImplements[] = "tf._implements"; +constexpr char kCustomSSDPostprocessing[] = "TFLite_Detection_PostProcess"; +constexpr char kTfNMSPadded[] = "non_max_suppression_padded_v2"; + +inline OpaqueElementsAttr CustomOption(OpBuilder* builder, + const std::string& content) { + ShapedType type = RankedTensorType::get( + {static_cast(content.size())}, builder->getIntegerType(8)); + return OpaqueElementsAttr::get(builder->getContext()->getLoadedDialect("tfl"), + type, + StringRef(content.data(), content.size())); +} + +} // namespace + +void ConvertNMSPaddedFunc::RewriteFunc() { + func_.setAttr(kTFImplements, + StringAttr::get(kTfNMSPadded, func_.getContext())); + Value boxes = func_.getArgument(0); + Value scores = func_.getArgument(1); + Value max_output_size = func_.getArgument(2); + Value iou_threshold = func_.getArgument(3); + Value score_threshold = func_.getArgument(4); + auto output_type0 = func_.getType().getResult(0); + auto output_type1 = func_.getType().getResult(1); + + OpBuilder builder(func_.getBody()); + auto op = builder.create( + func_.getLoc(), output_type0, output_type1, boxes, scores, + max_output_size, iou_threshold, score_threshold); + + builder.create(func_.getLoc(), op.getResults()); +} + +LogicalResult ConvertNMSPaddedFunc::VerifySignature() { + // Verify high-level function signature. + // Relevant argument characteristics are checked by the TFL op definition. + if (func_.getNumArguments() < 5) { + return func_.emitError() + << "Invalid number of arguments to " + "non_max_suppression_padded_v2 (need atleast 5): " + << func_.getNumArguments(); + } + if (func_.getType().getNumResults() != 2) { + return func_.emitError() << "Invalid number of results from " + "non_max_suppression_padded_v2 (need 2): " + << func_.getType().getNumResults(); + } + // The TFLite fused op does not support batching yet. + // TODO(b/158709815): Add support for batches with padded NMS. 
+ auto boxes_type = func_.getArgument(0).getType().dyn_cast(); + if (!boxes_type.hasRank() || boxes_type.getRank() != 2) { + return func_.emitError() << "TFLite does not support batched input for " + "non_max_suppression_padded"; + } + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::RewriteFunc() { + func_.eraseBody(); + func_.addEntryBlock(); + func_.setAttr(kTFImplements, + StringAttr::get(kCustomSSDPostprocessing, func_.getContext())); + + OpBuilder builder(func_.getBody()); + std::string custom_option_buffer; + if (failed(CreateNMSCustomOptions(func_, attr_.GetAttrs(), + custom_option_buffer))) { + return failure(); + } + auto op = builder.create( + func_.getLoc(), func_.getType().getResults(), func_.getArguments(), + kCustomSSDPostprocessing, CustomOption(&builder, custom_option_buffer)); + builder.create(func_.getLoc(), op.getResults()); + + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::CreateNMSCustomOptions( + FuncOp func, DictionaryAttr attrs, std::string& custom_option_buffer) { + flexbuffers::Builder fbb; + size_t start_map = fbb.StartMap(); + + if (failed(AddIntAttr(func, attrs, "max_detections", &fbb)) || + failed(AddIntAttr(func, attrs, "max_classes_per_detection", &fbb)) || + failed(AddIntAttr(func, attrs, "num_classes", &fbb)) || + failed(AddFloatAttr(func, attrs, "nms_score_threshold", &fbb)) || + failed(AddFloatAttr(func, attrs, "nms_iou_threshold", &fbb)) || + failed(AddFloatAttr(func, attrs, "y_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "x_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "h_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "w_scale", &fbb))) + return failure(); + auto use_regular_nms = + attrs.get("use_regular_nms").dyn_cast_or_null(); + if (!use_regular_nms) { + return func.emitError() + << "use_regular_nms attribute is not set or not a bool"; + } + fbb.Int("use_regular_nms", use_regular_nms.getValue()); + + fbb.EndMap(start_map); + fbb.Finish(); + custom_option_buffer.assign(fbb.GetBuffer().begin(), fbb.GetBuffer().end()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::AddIntAttr( + FuncOp func, DictionaryAttr attrs, const std::string& attribute, + flexbuffers::Builder* builder) { + auto int_attr = attrs.get(attribute).dyn_cast_or_null(); + if (!int_attr) { + return func.emitError() + << attribute.c_str() << " attribute is not set or not an integer"; + } + builder->Int(attribute.c_str(), int_attr.getInt()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::AddFloatAttr( + FuncOp func, DictionaryAttr attrs, const std::string& attribute, + flexbuffers::Builder* builder) { + auto float_attr = attrs.get(attribute).dyn_cast_or_null(); + if (!float_attr) { + return func.emitError() + << attribute.c_str() << " attribute is not set or not a float"; + } + builder->Float(attribute.c_str(), float_attr.getValue().convertToFloat()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::VerifySignature() { + // Verify high-level function signature. 
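The non_max_suppression_padded_v2 composite matched by ConvertNMSPaddedFunc above is, as far as its signature checks suggest, the one traced out of tf.image.non_max_suppression_padded (at least 5 arguments, 2 results, unbatched rank-2 boxes). A minimal call that exercises that API (illustrative only; the thresholds are arbitrary):

    import tensorflow as tf

    boxes = tf.random.uniform([8, 4])    # rank-2, i.e. unbatched, as required above
    scores = tf.random.uniform([8])
    selected_indices, num_valid = tf.image.non_max_suppression_padded(
        boxes, scores, max_output_size=4,
        iou_threshold=0.5, score_threshold=0.1,
        pad_to_max_output_size=True)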
+ if (func_.getNumArguments() != 3) { + return func_.emitError() + << "Invalid number of arguments to " << kCustomSSDPostprocessing + << ": " << func_.getNumArguments(); + } + if (func_.getType().getNumResults() != 4) { + return func_.emitError() + << "Invalid number of results from " << kCustomSSDPostprocessing + << ": " << func_.getType().getNumResults(); + } + return success(); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.h b/tensorflow/compiler/mlir/lite/utils/nms_utils.h new file mode 100644 index 00000000000..6a9035e0c81 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with NMS ops in TFLite. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ + +#include + +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" + +namespace mlir { +namespace TFL { + +// Abstracts the conversion of the padded NMS composite function. +class ConvertNMSPaddedFunc { + public: + explicit ConvertNMSPaddedFunc(FuncOp func) : func_(func) {} + + void RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + FuncOp func_; +}; + +// Abstracts the conversion of the SSD post-processing composite function to +// TFLite. 
+class ConvertSSDPostProcessFunc { + public: + explicit ConvertSSDPostProcessFunc(FuncOp func, mlir::TF::FuncAttr attr) + : func_(func), attr_(attr) {} + + LogicalResult RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + LogicalResult CreateNMSCustomOptions(FuncOp func, DictionaryAttr attrs, + std::string& custom_option_buffer); + + LogicalResult AddIntAttr(FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + LogicalResult AddFloatAttr(FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + FuncOp func_; + mlir::TF::FuncAttr attr_; +}; + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index 96d22cb51e9..cce8038d3fa 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -47,6 +47,7 @@ namespace { constexpr char kNgrams[] = "tftext:Ngrams"; constexpr char kWhitespaceTokenizer[] = "tftext:WhitespaceTokenizer"; +constexpr char kCustomSgnnProjection[] = "tftext:custom:SgnnProjection"; constexpr char kTFImplements[] = "tf._implements"; using mlir::TF::FuncAttr; @@ -56,9 +57,9 @@ inline OpaqueElementsAttr CustomOption(OpBuilder* builder, const std::string& content) { ShapedType type = RankedTensorType::get( {static_cast(content.size())}, builder->getIntegerType(8)); - return OpaqueElementsAttr::get( - builder->getContext()->getRegisteredDialect("tfl"), type, - StringRef(content.data(), content.size())); + return OpaqueElementsAttr::get(builder->getContext()->getLoadedDialect("tfl"), + type, + StringRef(content.data(), content.size())); } inline TensorType GetInputType(FuncOp func, int idx) { @@ -269,6 +270,85 @@ LogicalResult ConvertNgrams(FuncOp func, llvm::StringRef api, FuncAttr attr) { return success(); } +LogicalResult VerifySgnnProjection(FuncOp func, FuncAttr attr) { + if (func.getType().getNumInputs() != 2 || + func.getType().getNumResults() != 1) { + return func.emitError() << "Mismatched number of inputs and outputs."; + } + auto values_type = GetInputType(func, 0); + if (!values_type || !values_type.getElementType().isa()) { + return func.emitError() << "First input should be a string tensor"; + } + auto row_splits_type = GetInputType(func, 1); + if (!row_splits_type || + !row_splits_type.getElementType().isa()) { + return func.emitError() << "Second input should be an integer tensor"; + } + + auto hash_seed = + attr.GetAttrs().get("hash_seed").dyn_cast_or_null(); + if (!hash_seed) { + return func.emitError() + << "'hash_seed' attribute is not set or not an array"; + } + auto output_type = GetResultType(func, 0); + if (!output_type || !output_type.getElementType().isa() || + !RankEquals(output_type, 2)) { + return func.emitError() << "Output should be a 2D float tensor."; + } + if (output_type.getDimSize(1) != hash_seed.size()) { + return func.emitError() + << "Output 2nd dimension should be the num of hash seeds."; + } + + auto buckets = attr.GetAttrs().get("buckets").dyn_cast_or_null(); + if (!buckets) { + return func.emitError() << "'buckets' attribute is not set or not int"; + } + + return success(); +} + +LogicalResult CreateSgnnProjectionCustomOption( + FuncOp func, DictionaryAttr attrs, std::string& custom_option_buffer) { + flexbuffers::Builder fbb; + size_t start_map = fbb.StartMap(); + + auto hash_seed = 
attrs.get("hash_seed").dyn_cast_or_null(); + auto vector_start = fbb.StartVector("hash_seed"); + for (int i = 0; i < hash_seed.size(); i++) { + fbb.Add(static_cast( + (hash_seed.getValue().data() + i)->dyn_cast().getInt())); + } + fbb.EndVector(vector_start, /*typed=*/true, /*fixed=*/false); + + auto buckets = attrs.get("buckets").dyn_cast_or_null(); + fbb.Int("buckets", buckets.getInt()); + + fbb.EndMap(start_map); + fbb.Finish(); + custom_option_buffer.assign(fbb.GetBuffer().begin(), fbb.GetBuffer().end()); + return success(); +} + +LogicalResult ConvertSgnnProjection(FuncOp func, llvm::StringRef api, + FuncAttr attr) { + // See more details in tensorflow_models/sequence_projection/sgnn/sgnn.py + func.eraseBody(); + func.addEntryBlock(); + func.setAttr(kTFImplements, attr); + OpBuilder builder(func.getBody()); + std::string custom_option_buffer; + if (failed(CreateSgnnProjectionCustomOption(func, attr.GetAttrs(), + custom_option_buffer))) { + return failure(); + } + auto op = builder.create( + func.getLoc(), func.getType().getResults(), func.getArguments(), api, + CustomOption(&builder, custom_option_buffer)); + builder.create(func.getLoc(), op.getResults()); + return success(); +} } // namespace LogicalResult ConvertTFTextAPI(FuncOp func, llvm::StringRef api, @@ -281,6 +361,10 @@ LogicalResult ConvertTFTextAPI(FuncOp func, llvm::StringRef api, if (succeeded(VerifyNgrams(func))) { return ConvertNgrams(func, api, attr); } + } else if (api.str() == kCustomSgnnProjection) { + if (succeeded(VerifySgnnProjection(func, attr))) { + return ConvertSgnnProjection(func, api, attr); + } } return failure(); } diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 8e6d9042987..d97e12fbe45 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -91,16 +91,14 @@ MlirOptimizationPassRegistry& MlirOptimizationPassRegistry::Global() { return *global; } -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; +static void RegisterDialects(mlir::DialectRegistry& registry) { + // clang-format off + registry.insert(); + // clang-format on } Status MlirFunctionOptimizationPass::Run( @@ -126,12 +124,18 @@ Status MlirFunctionOptimizationPass::Run( << " passes)"; GraphDebugInfo debug_info; - RegisterDialects(); mlir::MLIRContext context; + RegisterDialects(context.getDialectRegistry()); GraphImportConfig import_config; import_config.graph_as_function = true; import_config.control_outputs = *control_ret_node_names; import_config.upgrade_legacy = true; + // Disable shape inference during import as some TensorFlow op fails during + // shape inference with dynamic shaped operands. This in turn causes the + // import to fail. Shape inference during import is going to be removed and + // the shape inference pass is run early in the pass pipeline, shape inference + // during import is not necessary. 
+ import_config.enable_shape_inference = false; TF_ASSIGN_OR_RETURN(auto module_ref, ConvertGraphToMlir(**graph, debug_info, *flib_def, import_config, &context)); @@ -200,8 +204,8 @@ Status MlirV1CompatGraphOptimizationPass::Run( << " passes)"; GraphDebugInfo debug_info; - RegisterDialects(); mlir::MLIRContext context; + RegisterDialects(context.getDialectRegistry()); GraphImportConfig import_config; import_config.upgrade_legacy = true; // Restrict functionalization to TPU nodes to avoid problems in v1 session diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index bce0ed4a33d..6b605741355 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/utils/name_utils.h" static inline absl::string_view StringRefToView(llvm::StringRef ref) { return absl::string_view(ref.data(), ref.size()); @@ -103,62 +104,16 @@ int OpOrArgNameMapper::InitOpName(OpOrVal op_or_val, llvm::StringRef name) { bool OpOrArgNameMapper::IsUnique(llvm::StringRef name) { return true; } -namespace { -// Derives name from location. -std::string GetNameFromLoc(mlir::Location loc) { - llvm::SmallVector loc_names; - llvm::SmallVector locs; - locs.push_back(loc); - bool names_is_nonempty = false; - - while (!locs.empty()) { - mlir::Location curr_loc = locs.pop_back_val(); - - if (auto name_loc = curr_loc.dyn_cast()) { - // Add name in NameLoc. For NameLoc we also account for names due to ops - // in functions where the op's name is first. - auto name = name_loc.getName().strref().split('@').first; - loc_names.push_back(name); - if (!name.empty()) names_is_nonempty = true; - continue; - } else if (auto call_loc = curr_loc.dyn_cast()) { - // Add name if CallSiteLoc's callee has a NameLoc (as should be the - // case if imported with DebugInfo). - if (auto name_loc = call_loc.getCallee().dyn_cast()) { - auto name = name_loc.getName().strref().split('@').first; - loc_names.push_back(name); - if (!name.empty()) names_is_nonempty = true; - continue; - } - } else if (auto fused_loc = curr_loc.dyn_cast()) { - // Push all locations in FusedLoc in reverse order, so locations are - // visited based on order in FusedLoc. - auto reversed_fused_locs = llvm::reverse(fused_loc.getLocations()); - locs.append(reversed_fused_locs.begin(), reversed_fused_locs.end()); - continue; - } - - // Location is not a supported, so an empty StringRef is added. - loc_names.push_back(llvm::StringRef()); - } - - if (names_is_nonempty) - return llvm::join(loc_names.begin(), loc_names.end(), ";"); - - return ""; -} -} // anonymous namespace - std::string OpOrArgLocNameMapper::GetName(OpOrVal op_or_val) { if (auto* op = op_or_val.dyn_cast()) { - auto name_from_loc = GetNameFromLoc(op->getLoc()); + auto name_from_loc = mlir::GetNameFromLoc(op->getLoc()); if (!name_from_loc.empty()) return name_from_loc; // If the location is none of the expected types, then simply use name // generated using the op type. 
return std::string(op->getName().getStringRef()); } auto val = op_or_val.dyn_cast(); - auto name_from_loc = GetNameFromLoc(val.getLoc()); + auto name_from_loc = mlir::GetNameFromLoc(val.getLoc()); if (!name_from_loc.empty()) return name_from_loc; // If the location is none of the expected types, then simply use name // generated using the op type. Follow TF convention and append the result diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 5bbfba773a3..66283bded71 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -10,6 +10,7 @@ cc_library( deps = [ "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", @@ -35,6 +36,9 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 5ce0ca8cfcb..066726593a7 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -16,19 +16,53 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/op.h" namespace tensorflow { +namespace { + +// Runs pass pipeline `pass_pipeline` on `module` if `pass_pipeline` is not +// empty. 
+std::string RunPassPipelineOnModule(mlir::ModuleOp module, + const std::string &pass_pipeline, + TF_Status *status) { + if (!pass_pipeline.empty()) { + mlir::PassManager pm(module.getContext()); + std::string error; + llvm::raw_string_ostream error_stream(error); + if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + ("Invalid pass_pipeline: " + error_stream.str()).c_str()); + return "// error"; + } + + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext()); + if (failed(pm.run(module))) { + Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); + return "// error"; + } + } + return MlirModuleToString(module); +} + +} // anonymous namespace + std::string ImportGraphDef(const std::string &proto, const std::string &pass_pipeline, TF_Status *status) { @@ -47,24 +81,43 @@ std::string ImportGraphDef(const std::string &proto, return "// error"; } - // Run the pass_pipeline on the module if not empty. - if (!pass_pipeline.empty()) { - mlir::PassManager pm(&context); - std::string error; - llvm::raw_string_ostream error_stream(error); - if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - ("Invalid pass_pipeline: " + error_stream.str()).c_str()); - return "// error"; - } + return RunPassPipelineOnModule(module->get(), pass_pipeline, status); +} - mlir::StatusScopedDiagnosticHandler statusHandler(&context); - if (failed(pm.run(*module.ValueOrDie()))) { - Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); - return "// error"; - } +std::string ImportFunction(const std::string &functiondef_proto, + const std::string &functiondef_library_proto, + const std::string &pass_pipeline, + TF_Status *status) { + FunctionDef functiondef; + auto s = tensorflow::LoadProtoFromBuffer(functiondef_proto, &functiondef); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; } - return MlirModuleToString(*module.ConsumeValueOrDie()); + + FunctionDefLibrary fdef_lib; + s = tensorflow::LoadProtoFromBuffer(functiondef_library_proto, &fdef_lib); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; + } + + FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib); + s = flib_def.AddFunctionDef(functiondef); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; + } + + const std::string &function_name = functiondef.signature().name(); + mlir::MLIRContext context; + auto module = ConvertFunctionToMlir(function_name, flib_def, &context); + if (!module.ok()) { + Set_TF_Status_from_Status(status, module.status()); + return "// error"; + } + + return RunPassPipelineOnModule(module->get(), pass_pipeline, status); } std::string ExperimentalConvertSavedModelToMlir( @@ -150,6 +203,7 @@ std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, bool show_debug_info, TF_Status *status) { mlir::MLIRContext context; + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); mlir::OwningModuleRef module; { mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); @@ -164,6 +218,7 @@ std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, mlir::PassManager pm(&context); std::string error; llvm::raw_string_ostream error_stream(error); + mlir::registerAllPasses(); if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { TF_SetStatus(status, TF_INVALID_ARGUMENT, ("Invalid pass_pipeline: " + error_stream.str()).c_str()); diff --git 
a/tensorflow/compiler/mlir/python/mlir.h b/tensorflow/compiler/mlir/python/mlir.h index e68ac28124b..6133068a5e8 100644 --- a/tensorflow/compiler/mlir/python/mlir.h +++ b/tensorflow/compiler/mlir/python/mlir.h @@ -25,13 +25,23 @@ limitations under the License. namespace tensorflow { // Simple wrapper to support tf.mlir.experimental.convert_graph_def. -// Load a .pbptx, convert to MLIR, and (optionally) optimize the module before -// returning it as a string. +// Load a GraphDef (binary or textual proto format), convert to MLIR, and +// (optionally) optimize the module before returning it as a string. // This is an early experimental API, ideally we should return a wrapper object // around a Python binding to the MLIR module. std::string ImportGraphDef(const std::string &proto, const std::string &pass_pipeline, TF_Status *status); +// Simple wrapper to support tf.mlir.experimental.convert_function. +// Load FunctionDef and FunctionDefLibrary (binary or textual proto format), +// convert to MLIR, and (optionally) optimize the module before returning it as +// a string. +// This is an early experimental API, ideally we should return a wrapper object +// around a Python binding to the MLIR module. +std::string ImportFunction(const std::string &functiondef_proto, + const std::string &functiondef_library_proto, + const std::string &pass_pipeline, TF_Status *status); + // Load a SavedModel and return a textual MLIR string corresponding to it. // // Args: diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD index 5e21dddd444..31bce8d1bf6 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD @@ -21,6 +21,7 @@ tf_python_pybind_extension( "//tensorflow/python:pybind11_lib", "//tensorflow/python:pybind11_status", "@llvm-project//llvm:Support", + "@llvm-project//llvm:filecheck-lib", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:StandardOps", @@ -37,6 +38,7 @@ tf_python_pybind_extension( "//tensorflow/python:pybind11_lib", "//tensorflow/python:pybind11_status", "@llvm-project//llvm:Support", + "@llvm-project//llvm:filecheck-lib", "@pybind11", ], ) diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc index 25adb44fe1d..5ae638851f4 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/FileCheck.h" +#include "llvm/FileCheck/FileCheck.h" #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc index 8a841856b72..051952ebaba 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
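Per the mlir.h comments above, ImportGraphDef backs tf.mlir.experimental.convert_graph_def, and the new ImportFunction is described as backing a tf.mlir.experimental.convert_function wrapper. A minimal sketch of the existing GraphDef entry point (illustrative only; the exact Python signature of the new convert_function wrapper is not shown in this change):

    import tensorflow as tf

    # Build a trivial GraphDef and import it into MLIR textual form.
    graph = tf.Graph()
    with graph.as_default():
      a = tf.constant(1.0, name="a")
      b = tf.constant(2.0, name="b")
      tf.math.add(a, b, name="sum")

    # An empty pass_pipeline just round-trips the import; a registered pipeline
    # such as "tf-standard-pipeline" can be supplied instead.
    print(tf.mlir.experimental.convert_graph_def(
        graph.as_graph_def(), pass_pipeline=""))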
==============================================================================*/ -#include "llvm/Support/FileCheck.h" +#include "llvm/FileCheck/FileCheck.h" #include "llvm/Support/SourceMgr.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc index 63ca4c7bb28..6cd49cf368d 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc @@ -22,22 +22,25 @@ limitations under the License. #include "mlir/Parser.h" // from @llvm-project #include "pybind11/pybind11.h" #include "pybind11/stl.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/python/lib/core/pybind11_lib.h" #include "tensorflow/python/lib/core/pybind11_status.h" PYBIND11_MODULE(mlir_wrapper, m) { - m.def("registerDialects", []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); + m.def("preloadTensorFlowDialects", [](mlir::MLIRContext &context) { + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); + context.getDialectRegistry().loadAll(&context); }); + m.def("verify", [](std::string input) { llvm::SourceMgr SM = llvm::SourceMgr(); SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), llvm::SMLoc()); mlir::MLIRContext ctx; + mlir::RegisterAllTensorFlowDialects(ctx.getDialectRegistry()); + ctx.getDialectRegistry().loadAll(&ctx); auto module = mlir::parseSourceFile(SM, &ctx); if (!module) { return false; diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc index 2be67f8e93e..be2dc2065f3 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc @@ -20,11 +20,6 @@ limitations under the License. 
void init_types(py::module& m) { // Type py::class_ Type(m, "Type"); - Type.def("getKind", &mlir::Type::getKind); - - // Type Enums - py::enum_(Type, "StandardTypes_Kind") - .value("BF16", mlir::StandardTypes::BF16); // Type Sub-classes py::class_(m, "FunctionType") @@ -32,7 +27,10 @@ void init_types(py::module& m) { [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); py::class_(m, "FloatType") - .def("get", &mlir::FloatType::get); + .def("getBF16", &mlir::FloatType::getBF16) + .def("getF16", &mlir::FloatType::getF16) + .def("getF32", &mlir::FloatType::getF32) + .def("getF64", &mlir::FloatType::getF64); py::class_(m, "IntegerType") .def("get", py::overload_cast( diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 45c8dce8422..f9870183b88 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -74,7 +74,7 @@ tool_names = [ 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt', 'hlo_to_llvm_ir', - 'kernel-gen-opt', 'xla-thunks-opt' + 'kernel-gen-opt', 'xla-thunks-opt', 'tfjs-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index c6f0083fc92..7bdc3b0396f 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -13,6 +13,7 @@ package_group( "//learning/brain/experimental/dtensor/...", "//learning/brain/experimental/tfrt/...", "//learning/pathways/data_parallel/tf2xla/...", + "//platforms/xla/sparse_core/...", "//tensorflow/compiler/...", "//tensorflow/lite/experimental/tf_runtime/...", "//tensorflow/python/...", @@ -33,6 +34,7 @@ filegroup( "ir/tf_op_base.td", "ir/tf_op_interfaces.td", "ir/tf_ops.td", + "ir/tfrt_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", @@ -124,6 +126,25 @@ gentbl( ], ) +gentbl( + name = "tensorflow_tfrt_ops_inc_gen", + tbl_outs = [ + ( + "-gen-op-decls", + "ir/tfrt_ops.h.inc", + ), + ( + "-gen-op-defs", + "ir/tfrt_ops.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tfrt_ops.td", + td_srcs = [ + ":tensorflow_ops_td_files", + ], +) + # We only shard tf_op on name for build performance reasons. 
tf_ops_category_list = [ { @@ -343,6 +364,7 @@ cc_library( name = "tensorflow_" + target["name"], srcs = [ "ir/tf_ops.h", + "ir/tfrt_ops.h", "ir/tf_remaining_ops.h", "ir/tf_" + target["name"] + ".cc", "ir/tf_" + target["name"] + ".cc.inc", @@ -352,9 +374,11 @@ cc_library( textual_hdrs = [ "ir/tf_all_ops.h.inc", "ir/tf_ops_helpers.inc", + "ir/tfrt_ops.h.inc", "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ + ":attribute_utils", ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_op_interfaces", @@ -385,6 +409,7 @@ cc_library( "ir/tf_ops.h", "ir/tf_remaining_ops.h", "ir/tf_remaining_ops.cc", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], hdrs = [ ], @@ -392,6 +417,49 @@ cc_library( "ir/tf_all_ops.h.inc", "ir/tf_ops_helpers.inc", "ir/tf_remaining_ops.h.inc", + "ir/tfrt_ops.h.inc", + ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], + deps = [ + ":tensorflow_attributes", + ":tensorflow_canonicalize_inc_gen", + ":tensorflow_op_interfaces", + ":tensorflow_op_interfaces_inc_gen", + ":tensorflow_remaining_ops_inc_gen", + ":tensorflow_side_effects", + ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", + ":tensorflow_traits", + ":tensorflow_types", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LoopLikeInterface", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tensorflow_tfrt_ops", + srcs = [ + "ir/tf_ops.h", + "ir/tfrt_ops.h", + "ir/tfrt_ops.cc", + "ir/tf_remaining_ops.h", + ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], + hdrs = [ + ], + textual_hdrs = [ + "ir/tf_all_ops.h.inc", + "ir/tf_ops_helpers.inc", + "ir/tfrt_ops.h.inc", + "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ ":tensorflow_attributes", @@ -401,6 +469,7 @@ cc_library( ":tensorflow_remaining_ops_inc_gen", ":tensorflow_side_effects", ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_traits", ":tensorflow_types", "//tensorflow/core:framework", @@ -427,9 +496,11 @@ cc_library( textual_hdrs = [ "ir/tf_all_ops.h.inc", "ir/tf_remaining_ops.h", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], deps = [ ":tensorflow_all_ops_inc_gen", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_remaining_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", @@ -440,6 +511,7 @@ cc_library( ":tensorflow_traits", ":tensorflow_types", ":tensorflow_remaining_ops", + ":tensorflow_tfrt_ops", "@llvm-project//llvm:Support", "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", @@ -512,6 +584,7 @@ cc_library( "ir/tf_saved_model.cc", ], hdrs = [ + "dialect_registration.h", "ir/tf_device.h", "ir/tf_executor.h", "ir/tf_ops.h", @@ -536,6 +609,7 @@ cc_library( ":tensorflow_ops", ":tensorflow_side_effects", ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_traits", ":tensorflow_types", ":tf_saved_model_inc_gen", @@ -718,12 +792,13 @@ cc_library( deps = [ ":tensorflow", ":tensorflow_types", - 
"//tensorflow/compiler/tf2xla:resource_operation_table", "//tensorflow/core:framework", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", ], ) @@ -738,6 +813,7 @@ cc_library( "transforms/cluster_formation.cc", "transforms/cluster_outlining.cc", "transforms/collection_ops_util.cc", + "transforms/contraction_fusion.cc", "transforms/decompose_resource_ops_pass.cc", "transforms/device_index_selector.cc", "transforms/einsum.cc", @@ -769,24 +845,34 @@ cc_library( "transforms/replicate_to_island.cc", "transforms/resource_device_inference.cc", "transforms/resource_op_lifting.cc", + "transforms/resource_op_lifting_cleanup.cc", + "transforms/resource_op_lifting_cleanup.h", "transforms/rewrite_tpu_embedding_ops.cc", "transforms/shape_inference.cc", "transforms/shape_inference_pass.cc", "transforms/sink_constant.cc", "transforms/stack_ops_decomposition.cc", "transforms/tensor_array_ops_decomposition.cc", + "transforms/tensor_device_copy_conversion.cc", "transforms/tensor_list_ops_decomposition.cc", + "transforms/test_resource_alias_analysis.cc", "transforms/test_side_effect_analysis.cc", + "transforms/test_visitor_util.cc", "transforms/tf_data_optimization_pass.cc", "transforms/tf_device_assignment.cc", + "transforms/tpu_cluster_cleanup_attributes.cc", "transforms/tpu_cluster_formation.cc", + "transforms/tpu_colocate_composite_resource_ops.cc", "transforms/tpu_dynamic_layout_pass.cc", "transforms/tpu_dynamic_padding_mapper.cc", "transforms/tpu_extract_head_tail_outside_compilation.cc", "transforms/tpu_extract_outside_compilation.cc", "transforms/tpu_host_computation_expansion.cc", + "transforms/tpu_identity_pruning.cc", "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_outside_compilation_cluster.cc", + "transforms/tpu_parallel_execute_sink_resource_write.cc", + "transforms/tpu_resource_read_for_write.cc", "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", "transforms/tpu_space_to_depth_pass.cc", @@ -797,8 +883,6 @@ cc_library( "translate/tf_functional_to_executor.cc", ], hdrs = [ - "transforms/attribute_utils.h", - "transforms/batchmatmul_to_einsum.h", "transforms/bridge.h", "transforms/collection_ops_util.h", "transforms/einsum.h", @@ -806,7 +890,11 @@ cc_library( "transforms/shape_inference.h", ], includes = ["include"], + textual_hdrs = [ + "ir/tf_ops_helpers.inc", + ], deps = [ + ":attribute_utils", ":bridge_logger", ":convert_tensor", ":convert_type", @@ -815,7 +903,10 @@ cc_library( ":device_util", ":error_util", ":export_tf_dialect_op", + ":lower_tf_lib", ":mangling_util", + ":serialize_mlir_module_utils", + ":shape_inference_utils", ":tensorflow", ":tensorflow_analysis", ":tensorflow_optimize_inc_gen", @@ -824,9 +915,12 @@ cc_library( ":tpu_rewrite_device_util", ":translate_utils", ":unroll_batch_matmul_pass", + ":visitor_util", ":xla_sharding_util", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite:validators", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/compiler/xla/client:sharding_builder", @@ -843,6 +937,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", 
"@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", @@ -909,17 +1004,6 @@ cc_library( alwayslink = 1, ) -# Library with TensorFlow dialect static initialization. -cc_library( - name = "tensorflow_dialect_registration", - srcs = ["ir/dialect_registration.cc"], - deps = [ - ":tensorflow", - "@llvm-project//mlir:Shape", - ], - alwayslink = 1, -) - cc_library( name = "convert_graphdef", srcs = [ @@ -949,6 +1033,7 @@ cc_library( "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", + "//tensorflow/compiler/mlir:name_utils", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", "//tensorflow/compiler/xla:status_macros", @@ -1064,6 +1149,7 @@ cc_library( ":export_utils", ":tensorflow", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor/lib", @@ -1079,6 +1165,7 @@ cc_library( srcs = ["translate/translate_tf_dialect_op.cc"], deps = [ ":export_tf_dialect_op", + ":tensorflow", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -1264,7 +1351,7 @@ cc_library( name = "tf_dialect_passes", srcs = [ "transforms/constant_fold.cc", - "transforms/dialect_hooks.cc", + "transforms/decode_attributes_hook.cc", ], hdrs = [ "transforms/constant_fold.h", @@ -1292,9 +1379,8 @@ cc_library( cc_library( name = "tf_dialect_lib", deps = [ - ":tensorflow_dialect_registration", ":tf_dialect_passes", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) @@ -1305,6 +1391,7 @@ cc_library( deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", + ":tensorflow", "//tensorflow/compiler/tf2xla:functionalize_control_flow_pass_registration", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -1399,6 +1486,7 @@ cc_library( deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", + ":tensorflow", ":translate_cl_options", ":translate_lib", "//tensorflow/core:protos_all_cc", @@ -1481,25 +1569,27 @@ gentbl( COMPILE_MLIR_UTIL_DEPS = [ ":bridge_logger", ":convert_graphdef", + ":convert_tensor", ":convert_type", ":dump_mlir_util", ":error_util", ":mlir_roundtrip_flags", + ":serialize_mlir_module_utils", ":tensorflow", - ":tensorflow_dialect_registration", ":tensorflow_types", ":tensorflow_passes", ":translate_utils", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:variant", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "//tensorflow/compiler/mlir/hlo:hlo", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", "//tensorflow/compiler/mlir/hlo:sink_constants_to_control_flow", "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", "//tensorflow/compiler/mlir/xla:type_to_shape", @@ -1514,9 +1604,9 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor/lib", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", - ":convert_tensor", ] # Prefer to link 'compile_mlir_util' library that also links necessary @@ -1543,27 +1633,61 @@ cc_library( ], ) -tf_cc_test( - name = 
"compile_mlir_util_test", - size = "small", - srcs = ["utils/compile_mlir_util_test.cc"], +cc_library( + name = "compile_mlir_util_pass", + srcs = ["utils/compile_mlir_util_pass.cc"], deps = [ ":compile_mlir_util", - "//tensorflow/cc:function_ops", - "//tensorflow/cc:resource_variable_ops", - "//tensorflow/cc:scope", - "//tensorflow/compiler/jit", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - "//tensorflow/stream_executor/lib", + "@llvm-project//mlir:Pass", ], + alwayslink = 1, +) + +cc_library( + name = "serialize_mlir_module_utils", + srcs = ["utils/serialize_mlir_module_utils.cc"], + hdrs = ["utils/serialize_mlir_module_utils.h"], + deps = [ + ":error_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + ], +) + +cc_library( + name = "tf_xla_mlir_translate", + srcs = ["utils/tf_xla_mlir_translate.cc"], + deps = [ + ":compile_mlir_util", + ":mlir_roundtrip_flags", + ":serialize_mlir_module_utils", + ":tensorflow", + ":translate_cl_options", + "//tensorflow/compiler/mlir:string_container_utils", + "//tensorflow/compiler/mlir/xla:translate_cl_options", + "//tensorflow/compiler/tf2xla:xla_argument", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, ) cc_library( @@ -1627,6 +1751,7 @@ cc_library( deps = [ ":lower_tf_inc_gen", ":tensorflow", + ":tensorflow_ops", ":tensorflow_types", "//tensorflow/core:framework", "@llvm-project//llvm:Support", @@ -1679,6 +1804,7 @@ cc_library( ":tensorflow", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -1738,14 +1864,13 @@ cc_library( ":convert_graphdef", ":error_util", ":tensorflow", - ":tensorflow_dialect_registration", ":tensorflow_passes", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -1761,6 +1886,7 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", + "//tensorflow/core:ops", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:test", @@ -1781,6 +1907,21 @@ cc_library( ], ) +cc_library( + name = "visitor_util", + srcs = [ + "utils/visitor_util.cc", + ], + hdrs = [ + "utils/visitor_util.h", + ], + deps 
= [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "xla_sharding_util", srcs = [ @@ -1798,3 +1939,35 @@ cc_library( "@llvm-project//mlir:Support", ], ) + +cc_library( + name = "attribute_utils", + hdrs = ["utils/attribute_utils.h"], + deps = [ + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "shape_inference_utils", + srcs = ["utils/shape_inference_utils.cc"], + hdrs = ["utils/shape_inference_utils.h"], + deps = [ + ":convert_tensor", + ":convert_type", + ":export_utils", + ":tensorflow", + ":tensorflow_attributes", + ":tensorflow_types", + "//tensorflow/compiler/mlir:array_container_utils", + "//tensorflow/core:framework", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc index 3278c06fabe..d70bd01e490 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -21,11 +21,14 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "mlir/Analysis/CallGraph.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -34,19 +37,19 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { +namespace detail { -namespace { //===----------------------------------------------------------------------===// // BacktrackAnalysisInfo //===----------------------------------------------------------------------===// @@ -86,9 +89,6 @@ class BacktrackAnalysisInfo { // Backtracked values indexed by the result number. llvm::SmallVector backtracked_values_; }; -} // namespace - -namespace detail { //===----------------------------------------------------------------------===// // BacktrackAnalysis @@ -137,12 +137,46 @@ class BacktrackAnalysis { return GetAnalysisForRegion(region); } + // Returns the backtrack analysis for the given region if it exists. + // If the region has not yet been analyzed, returns llvm::None. 
+ Optional GetAnalysisIfExists(Region& region) const { + auto it = info_map_.find(®ion); + if (it == info_map_.end()) return llvm::None; + return &it->second; + } + + Optional GetAnalysisIfExists(FuncOp func) const { + return GetAnalysisIfExists(func.getBody()); + } + private: llvm::SmallDenseMap info_map_; }; // Analyzes all regions attached to all operations in the module. BacktrackAnalysis::BacktrackAnalysis(ModuleOp module) { + const CallGraph call_graph(module); + + // Visit functions bottom up when doing the analysis. Note that SCC iterator + // has the property that if there is an edge from SCC1->SCC2, SCC1 is visited + // after SCC2, i.e., the graph is traversed bottom up just the way we want. + auto scc_begin = llvm::scc_begin(&call_graph); + auto scc_end = llvm::scc_end(&call_graph); + for (auto& scc : make_range(scc_begin, scc_end)) { + // Each SCC node is a collection of callgraph nodes that form a cycle. We + // will visit these nodes in an arbitrary order. If a node being visited + // calls a function that has not yet been analyzed, we will not be able to + // backtrack through that function call (our analysis will be correct but + // pessimistic). + for (CallGraphNode* node : scc) { + if (node->isExternal()) continue; + Region* region = node->getCallableRegion(); + GetOrCreateAnalysis(*region); + } + } + + // This above call graph analysis will cover all regions attached to functions + // but we also need to analyze regions attached to other ops. module.walk([this](Operation* op) { for (Region& region : op->getRegions()) GetOrCreateAnalysis(region); }); @@ -161,17 +195,26 @@ Value BacktrackAnalysis::BacktrackValue(Value value) { // in the Island body. if (value == island.control()) break; value = island.GetYield().getOperand(res_index); - } else if (isa(op)) { + } else if (isa(op)) { value = op->getOperand(res_index); + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) break; + // Check if the function being called has been analyzed. if not, + // we cannot backtrack the value further. + Optional callee_info = GetAnalysisIfExists(func); + if (!callee_info) break; + Optional passthrough_arg = callee_info.getValue()->GetArg(res_index); + if (!passthrough_arg) break; + value = call.getArgOperands()[passthrough_arg.getValue()]; + } else if (isa(op)) { + value = op->getRegion(0).front().getTerminator()->getOperand(res_index); } else { break; } } return value; } -} // namespace detail - -namespace { // Analyze the region. BacktrackAnalysisInfo::BacktrackAnalysisInfo( @@ -188,6 +231,8 @@ BacktrackAnalysisInfo::BacktrackAnalysisInfo( backtracked_values_.push_back(backtrack_analysis.BacktrackValue(result)); } +namespace { + //===----------------------------------------------------------------------===// // ResourceAliasAnalysisInfo helper functions. //===----------------------------------------------------------------------===// @@ -196,12 +241,12 @@ constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; // Returns if a VarHandleOp is anonymous, which means it always creates a new // variable. -bool IsResourceHandleAnonymous(TF::VarHandleOp handle) { +bool IsResourceHandleAnonymous(VarHandleOp handle) { return handle.shared_name() == tensorflow::ResourceHandle::ANONYMOUS_NAME; } // Returns a string unique identifier for a non-anonymous VarHandleOp. 
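
(Aside on the BacktrackValue loop above: it simply chases known pass-through hops — island yields, Identity/IdentityN, single-region wrappers, and calls whose callee forwards an argument — until no further hop is recorded. A minimal standalone sketch of that chase, with plain ints standing in for mlir::Value; PassThroughMap and these names are illustrative only, not part of this patch.)

#include <map>

// Maps a value to the value it merely forwards, when such a hop is known.
using PassThroughMap = std::map<int, int>;

// Follows pass-through hops until none is known. Assumes the forwarding
// relation is acyclic, as it is for SSA use-def chains.
int BacktrackValueSketch(int value, const PassThroughMap& forwards) {
  auto it = forwards.find(value);
  while (it != forwards.end()) {
    value = it->second;
    it = forwards.find(value);
  }
  return value;
}
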
-std::string GetVarHandleStringId(TF::VarHandleOp handle) { +std::string GetVarHandleStringId(VarHandleOp handle) { auto device = handle.getAttrOfType("device"); return absl::StrCat(handle.container().str(), "/", handle.shared_name().str(), "/", device ? device.getValue().str() : std::string("")); @@ -210,7 +255,7 @@ std::string GetVarHandleStringId(TF::VarHandleOp handle) { // Finds a unique ID for a VarHandleOp's output. If it is anonymous, always // creates a new ID; otherwise, tries to reuse the existing ID for the // referenced variable if it exists, or creates a new one if not. -int64_t GetOrCreateIdForVarHandle(TF::VarHandleOp handle, int64_t* next_id, +int64_t GetOrCreateIdForVarHandle(VarHandleOp handle, int64_t* next_id, llvm::StringMap* name_id_map) { // Always create a new ID for anonymous handle. if (IsResourceHandleAnonymous(handle)) return (*next_id)++; @@ -224,131 +269,269 @@ int64_t GetOrCreateIdForVarHandle(TF::VarHandleOp handle, int64_t* next_id, } // namespace -namespace detail { //===----------------------------------------------------------------------===// // ResourceAliasAnalysisInfo //===----------------------------------------------------------------------===// +constexpr int64_t ResourceAliasAnalysisInfo::kUnknownResourceId; + // Constructs the analysis info by analyzing the given function. ResourceAliasAnalysisInfo::ResourceAliasAnalysisInfo( - FuncOp func_op, const detail::BacktrackAnalysis& backtrack_analysis) { + FuncOp func_op, const BacktrackAnalysis& backtrack_analysis) { // This function populates resource_value_to_ids_ and id_to_resource_values_. + int64_t next_unique_id = 0; + + // Helper to assign new unique id for all resources in the given list of + // values. + auto assign_unique_id_to_all = [&](ValueRange values) { + for (Value value : filter_resources(values)) { + AddValueUniqueIDMapping(value, next_unique_id++); + } + }; + + // Helper to assign new unknown id for all resources in the given list of + // values. + auto assign_unknown_id_to_all = [&](ValueRange values) { + for (Value value : filter_resources(values)) { + AddValueUniqueIDMapping(value, kUnknownResourceId); + } + }; + // If the "tf.resource_arg_unique_id" argument attributes are present for // resource-type arguments, respect them when choosing IDs; otherwise, they // must not alias. - int64_t next_unique_id = 0; const bool has_arg_unique_id_attrs = llvm::any_of(func_op.getArguments(), [&](const BlockArgument& arg) { return func_op.getArgAttr(arg.getArgNumber(), kResourceArgUniqueIdAttr); }); // Maps the kResourceArgUniqueIdAttr attribute value to the internal integer // ID used by this pass. 
- llvm::SmallDenseMap attr_id_to_internal_id; - for (auto arg : func_op.getArguments()) { - if (!mlir::getElementTypeOrSelf(arg.getType()).isa()) - continue; - if (has_arg_unique_id_attrs) { + if (has_arg_unique_id_attrs) { + llvm::SmallDenseMap attr_id_to_internal_id; + for (auto arg : filter_resources(func_op.getArguments())) { auto id_attr = func_op.getArgAttrOfType( arg.getArgNumber(), kResourceArgUniqueIdAttr); assert(id_attr && - "tf.resource_arg_unique_id attribute should exist on either none " - "or all arguments."); + "tf.resource_arg_unique_id attribute should exist on either " + "none or all arguments."); auto emplace_res = attr_id_to_internal_id.try_emplace(id_attr.getInt(), next_unique_id++); AddValueUniqueIDMapping(arg, emplace_res.first->getSecond()); - } else { - AddValueUniqueIDMapping(arg, next_unique_id++); } + } else { + assign_unique_id_to_all(func_op.getArguments()); } - llvm::StringMap var_handle_name_id_map; - auto forward_input_to_output = [&](const Value& operand, - const Value& result) { - if (!mlir::getElementTypeOrSelf(result.getType()).isa()) - return; - auto& result_ids = resource_value_to_ids_[result]; - auto operand_it = resource_value_to_ids_.find(operand); - assert(operand_it != resource_value_to_ids_.end() && - "A resource-type output does not have the corresponding " - "resource-type input."); - result_ids.insert(operand_it->getSecond().begin(), - operand_it->getSecond().end()); - }; + // Since this analysis is neither inter-procedural nor inter-regional, + // each region attached to Op's within a function is analyzed independently. + // Seed this analysis for each such region by mapping all resource arguments + // for such regions to a new unique-id. This is required because walk() walks + // the attached regions first before visiting the op, so there is no + // opportunity during the walk to seed region arguments. Also note that walk + // eventually also visits the Op on which the walk() is called, so make sure + // we do not overwrite the function argument mapping here. 
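
To make the bookkeeping above concrete: AddValueUniqueIDMapping records the resource value to ID association in both directions, and two resources may alias exactly when their ID sets intersect or either set contains kUnknownResourceId. Below is a minimal standalone sketch of that invariant using plain STL maps, with ints standing in for mlir::Value; the AliasBookkeeping name is illustrative and not part of this patch.

#include <cstdint>
#include <map>
#include <set>

constexpr int64_t kUnknownResourceIdSketch = -1;

struct AliasBookkeeping {
  std::map<int, std::set<int64_t>> value_to_ids;  // resource value -> IDs
  std::map<int64_t, std::set<int>> id_to_values;  // ID -> resource values

  // Mirrors AddValueUniqueIDMapping: returns true if the mapping changed.
  bool AddMapping(int value, int64_t id) {
    value_to_ids[value].insert(id);
    return id_to_values[id].insert(value).second;
  }

  // Two resources may alias iff their ID sets intersect or either is unknown.
  bool MayAlias(int a, int b) const {
    const std::set<int64_t>& ids_a = value_to_ids.at(a);
    const std::set<int64_t>& ids_b = value_to_ids.at(b);
    if (ids_a.count(kUnknownResourceIdSketch) ||
        ids_b.count(kUnknownResourceIdSketch))
      return true;
    for (int64_t id : ids_a)
      if (ids_b.count(id)) return true;
    return false;
  }
};
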
func_op.walk([&](Operation* op) { - if (auto var_handle = llvm::dyn_cast(op)) { + if (op == func_op) return; + for (Region& region : op->getRegions()) { + assign_unique_id_to_all(region.getArguments()); + } + }); + + llvm::StringMap var_handle_name_id_map; + func_op.walk([&](Operation* op) { + if (auto var_handle = dyn_cast(op)) { AddValueUniqueIDMapping( var_handle.resource(), GetOrCreateIdForVarHandle(var_handle, &next_unique_id, &var_handle_name_id_map)); - } else if (llvm::isa(op)) { - for (auto operand_and_result : - llvm::zip(op->getOperands(), op->getResults())) { - forward_input_to_output(std::get<0>(operand_and_result), - std::get<1>(operand_and_result)); + } else if (llvm::isa(op)) { + for (auto result : filter_resources(op->getResults())) + PropagateInputToOutput(op->getOperand(result.getResultNumber()), + result); + } else if (auto while_op = dyn_cast(op)) { + AnalyzeWhileLoop(while_op, backtrack_analysis.GetAnalysisForFunc( + while_op.body_function())); + } else if (auto while_region = dyn_cast(op)) { + AnalyzeWhileLoop(while_region, backtrack_analysis.GetAnalysisForRegion( + while_region.body())); + } else if (auto case_op = dyn_cast(op)) { + llvm::SmallVector functions; + functions.reserve(case_op.branches().size()); + for (auto branch : case_op.branches()) + functions.emplace_back(SymbolTable::lookupNearestSymbolFrom( + case_op, branch.cast())); + + AnalyzeFunctionalCaseOrIfOp(case_op, functions, backtrack_analysis); + } else if (auto if_op = dyn_cast(op)) { + AnalyzeFunctionalCaseOrIfOp( + if_op, {if_op.then_function(), if_op.else_function()}, + backtrack_analysis); + } else if (llvm::isa(op)) { + AnalyzeRegionCaseOrIfOp(op, backtrack_analysis); + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) { + assign_unknown_id_to_all(op->getResults()); + return WalkResult::advance(); } - } else if (auto replicate = llvm::dyn_cast(op)) { - // The nested block for ReplicateOp is handled separately in side-effect - // analysis. Inside that block, we can still treat its block arguments as - // different resources. - for (auto arg : replicate.GetBody().getArguments()) { - if (mlir::getElementTypeOrSelf(arg.getType()).isa()) { - AddValueUniqueIDMapping(arg, next_unique_id++); - } - } - } else if (auto while_op = llvm::dyn_cast(op)) { - const auto& body_info = - backtrack_analysis.GetAnalysisForFunc(while_op.body_func()); - // If a result is a passthrough of the body input, use the corresponding - // operand's resource IDs. 
- for (auto result : llvm::enumerate(while_op.getResults())) { - if (!mlir::getElementTypeOrSelf(result.value().getType()) - .isa()) { - continue; - } - auto passthrough_arg = body_info.GetArg(result.index()); + const auto& func_info = backtrack_analysis.GetAnalysisForFunc(func); + for (auto result : filter_resources(op->getResults())) { + auto passthrough_arg = func_info.GetArg(result.getResultNumber()); if (passthrough_arg) { - forward_input_to_output( - while_op.getOperand(passthrough_arg.getValue()), result.value()); + PropagateInputToOutput( + call.getArgOperands()[passthrough_arg.getValue()], result); } else { - AddValueUniqueIDMapping(result.value(), kUnknownResourceId); + AddValueUniqueIDMapping(result, kUnknownResourceId); } } - } else if (auto if_op = llvm::dyn_cast(op)) { - const auto& then_info = - backtrack_analysis.GetAnalysisForFunc(if_op.then_func()); - const auto& else_info = - backtrack_analysis.GetAnalysisForFunc(if_op.else_func()); - // If a result is a passthrough of both branches' inputs, merge the - // resource IDs of corresponding operands for the two inputs. - for (auto result : llvm::enumerate(if_op.getResults())) { - if (!mlir::getElementTypeOrSelf(result.value().getType()) - .isa()) { - continue; - } - auto passthrough_then_arg = then_info.GetArg(result.index()); - auto passthrough_else_arg = else_info.GetArg(result.index()); - if (passthrough_then_arg && passthrough_else_arg) { - Value then_operand = if_op.input()[passthrough_then_arg.getValue()]; - Value else_operand = if_op.input()[passthrough_else_arg.getValue()]; - forward_input_to_output(then_operand, result.value()); - forward_input_to_output(else_operand, result.value()); - } else { - AddValueUniqueIDMapping(result.value(), kUnknownResourceId); - } + } else if (isa(op)) { + Region& region = op->getRegion(0); + const auto& body_info = backtrack_analysis.GetAnalysisForRegion(region); + for (auto result : filter_resources(op->getResults())) { + Value body_result = body_info.GetValue(result.getResultNumber()); + PropagateInputToOutput(body_result, result); } } else { - for (auto result : op->getResults()) { - if (!mlir::getElementTypeOrSelf(result.getType()) - .isa()) - continue; - AddValueUniqueIDMapping(result, kUnknownResourceId); - } + assign_unknown_id_to_all(op->getResults()); } + return WalkResult::advance(); }); } -bool ResourceAliasAnalysisInfo::IsUnknownResource(const Value resource) const { +// Propagates the resource ID's from an input operand to a result. Returns true +// if the mapping changed. +bool ResourceAliasAnalysisInfo::PropagateInputToOutput(const Value& operand, + const OpResult& result) { + auto operand_it = resource_value_to_ids_.find(operand); + assert(operand_it != resource_value_to_ids_.end() && + "A resource-type output does not have the corresponding " + "resource-type input."); + bool change = false; + for (int64_t id : operand_it->second) + change = AddValueUniqueIDMapping(result, id) || change; + return change; +} + +// Analyzes while loops to compute resourceIDs for the loop results. +// +// (1) The base case for the analysis is that if the loop body does not execute +// at all, the resource IDs for each result is the same as the resource IDs +// of the corresponding input. +// (2) If the loop does execute one or more times, then we need to account for +// data flow through the body of the while loop. If result #r is the same +// as arg #a of the loop body (pass through argument), then we can reason +// further, else if the result is not a passthrough, we mark it as unknown. 
+// (3) For passthrough results, if result #r is the same as arg #a of the loop +// body, after one iteration, result #r = arg #a, so we need to also +// propagate arg #a to result #r. After another iteration, arg #a of the +// loop body will be result #a of the previous iteration. So then we need +// propagate from result #a to result #r. Generalizing, the resource ID +// propagation (for results which are passthrough) looks like: +// +// for r in (0, num_results) : result[r] = arg[r]; +// repeat till no change { +// a = passthrough arg for result #r; +// result[r] += result[a]; +// } +// +void ResourceAliasAnalysisInfo::AnalyzeWhileLoop( + Operation* while_op, const BacktrackAnalysisInfo& body_info) { + // Seed the resource ID's for the results using either the resource ID of the + // passthrough arg, or unknown. We need to perform further analysis if we + // find a passthrough arg which is not the same as corresponding the result #. + llvm::SmallVector, 4> passthrough_args( + while_op->getNumResults()); + bool need_analysis = false; + for (auto result : filter_resources(while_op->getResults())) { + int result_index = result.getResultNumber(); + passthrough_args[result_index] = body_info.GetArg(result_index); + if (passthrough_args[result_index]) { + int passthru_index = passthrough_args[result_index].getValue(); + PropagateInputToOutput(while_op->getOperand(passthru_index), result); + need_analysis |= + !IsUnknownResource(result) && passthru_index != result_index; + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } + + if (!need_analysis) return; + + // We found a result that is not unknown and whose passthrough operand index + // is not the same as the result index, which means there is "crosstalk" + // between 2 or more operands. In that case, we do an iterative propagation + // of resource ID's till the results converge. + bool change = true; + while (change) { + change = false; + for (auto result : filter_resources(while_op->getResults())) { + if (IsUnknownResource(result)) continue; + // If this result has a valid passthrough arg, propagate resource ID's + // from the result of the passthrough arg + int result_index = result.getResultNumber(); + int passthru_index = passthrough_args[result_index].getValue(); + change = + PropagateInputToOutput(while_op->getResult(passthru_index), result) || + change; + } + } +} + +template +void ResourceAliasAnalysisInfo::AnalyzeFunctionalCaseOrIfOp( + CaseOrIfOp case_or_if_op, llvm::ArrayRef functions, + const BacktrackAnalysis& backtrack_analysis) { + llvm::SmallVector infos; + infos.reserve(functions.size()); + for (FuncOp func : functions) + infos.push_back(&backtrack_analysis.GetAnalysisForFunc(func)); + + // If a result is a passthrough of all branches' inputs, merge the resource + // IDs of corresponding operands for all the inputs. 
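
To make the two propagation rules concrete — the fixed-point iteration implemented in AnalyzeWhileLoop above, and the branch-merge rule just described (implemented below in AnalyzeFunctionalCaseOrIfOp) — here is a standalone sketch using plain STL sets of ints in place of the resource ID maps. A negative index stands for a missing passthrough argument, and all names are illustrative rather than part of this patch.

#include <set>
#include <vector>

using IdSet = std::set<int>;

// Fixed-point propagation for While: result r starts with the IDs of the
// operand feeding its passthrough body argument, then repeatedly absorbs the
// IDs of result #passthrough[r] until nothing changes. passthrough[r] < 0
// means "not a passthrough" (unknown) and is skipped here for brevity.
std::vector<IdSet> PropagateWhileResultIds(
    const std::vector<int>& passthrough,
    const std::vector<IdSet>& operand_ids) {
  const int n = static_cast<int>(passthrough.size());
  std::vector<IdSet> result_ids(n);
  for (int r = 0; r < n; ++r)
    if (passthrough[r] >= 0) result_ids[r] = operand_ids[passthrough[r]];
  bool changed = true;
  while (changed) {
    changed = false;
    for (int r = 0; r < n; ++r) {
      if (passthrough[r] < 0) continue;
      const IdSet src = result_ids[passthrough[r]];
      for (int id : src) changed |= result_ids[r].insert(id).second;
    }
  }
  return result_ids;
}

// Branch merge for functional Case/If: when every branch passes some operand
// through to result r, the result's IDs are the union over branches of that
// operand's IDs (the "any branch unknown" case is omitted here).
IdSet MergeBranchResultIds(const std::vector<int>& passthrough_per_branch,
                           const std::vector<IdSet>& operand_ids) {
  IdSet merged;
  for (int arg : passthrough_per_branch) {
    const IdSet& ids = operand_ids[arg];
    merged.insert(ids.begin(), ids.end());
  }
  return merged;
}
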
+ for (auto result : filter_resources(case_or_if_op.getResults())) { + llvm::SmallVector, 2> passthrough_args; + passthrough_args.reserve(functions.size()); + for (const auto* info : infos) + passthrough_args.emplace_back(info->GetArg(result.getResultNumber())); + + const bool all_passthrough_args_known = llvm::all_of( + passthrough_args, [](const llvm::Optional& passthrough_arg) { + return passthrough_arg.hasValue(); + }); + if (all_passthrough_args_known) { + for (const auto& passthrough_arg : passthrough_args) { + Value operand = case_or_if_op.input()[passthrough_arg.getValue()]; + PropagateInputToOutput(operand, result); + } + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } +} + +void ResourceAliasAnalysisInfo::AnalyzeRegionCaseOrIfOp( + Operation* case_or_if_op, const BacktrackAnalysis& backtrack_analysis) { + llvm::SmallVector infos; + infos.reserve(case_or_if_op->getNumRegions()); + for (Region& region : case_or_if_op->getRegions()) + infos.push_back(&backtrack_analysis.GetAnalysisForRegion(region)); + + // For region Case/If, the walk would have visited all branch regions before + // visiting the Case/If op. Backtracking of each region results will either + // give a value computed within these regions, or a region capture. If it is a + // region capture computed before this Case/If, it will have been visited + // earlier and a mapping would exist for that value. If it is computed within + // the region, then again a mapping would exist. + for (auto result : filter_resources(case_or_if_op->getResults())) { + for (const auto* info : infos) { + Value region_result = info->GetValue(result.getResultNumber()); + PropagateInputToOutput(region_result, result); + } + } +} + +bool ResourceAliasAnalysisInfo::IsUnknownResource(Value resource) const { auto it = resource_value_to_ids_.find(resource); assert(it != resource_value_to_ids_.end() && !it->getSecond().empty()); // The set is sorted so we only need to check the first element since @@ -360,6 +543,7 @@ bool ResourceAliasAnalysisInfo::IsUnknownResource(const Value resource) const { const llvm::SmallSet& ResourceAliasAnalysisInfo::GetResourceUniqueIds(Value resource) const { + assert(!IsUnknownResource(resource)); auto it = resource_value_to_ids_.find(resource); assert(it != resource_value_to_ids_.end() && "Unseen resource was queried"); return it->getSecond(); @@ -373,14 +557,19 @@ ResourceAliasAnalysisInfo::GetUniqueIdResources(const int64_t id) const { } llvm::SmallSetVector ResourceAliasAnalysisInfo::GetResourceAliases( - const Value resource) const { - assert(!IsUnknownResource(resource) && "Unseen resource was queried"); + Value resource) const { + assert(!IsUnknownResource(resource) && "Unknown resource was queried"); llvm::SmallSetVector aliases; for (int64_t id : GetResourceUniqueIds(resource)) { const llvm::SmallSetVector& resources_aliasing_id = GetUniqueIdResources(id); aliases.insert(resources_aliasing_id.begin(), resources_aliasing_id.end()); } + // If there are resources that were marked as unknown, they alias with all + // other resources. 
+ auto it = id_to_resource_values_.find(kUnknownResourceId); + if (it != id_to_resource_values_.end()) + aliases.insert(it->getSecond().begin(), it->getSecond().end()); return aliases; } @@ -390,10 +579,7 @@ llvm::SmallSetVector ResourceAliasAnalysisInfo::GetResourceAliases( // ResourceAliasAnalysis //===----------------------------------------------------------------------===// -ResourceAliasAnalysis::ResourceAliasAnalysis(Operation* op) { - auto module = dyn_cast(op); - assert(module); - +ResourceAliasAnalysis::ResourceAliasAnalysis(ModuleOp module) { // Analyze all regions for backtracking info. detail::BacktrackAnalysis backtrack_analysis(module); diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h index 5a514a7fb64..5575767dcc4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -20,18 +20,23 @@ limitations under the License. #include #include +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { namespace TF { namespace detail { class BacktrackAnalysis; +class BacktrackAnalysisInfo; // Resource alias analysis information for a single function. class ResourceAliasAnalysisInfo { @@ -43,7 +48,7 @@ class ResourceAliasAnalysisInfo { ResourceAliasAnalysisInfo(ResourceAliasAnalysisInfo&&) = default; // Returns if the analysis fails to resolve a resource-type value. - bool IsUnknownResource(const Value resource) const; + bool IsUnknownResource(Value resource) const; // Returns the set unique IDs which `resource` could alias. Requires that // IsUnknownResource(resource) == false. @@ -54,15 +59,35 @@ class ResourceAliasAnalysisInfo { llvm::SmallSetVector GetResourceAliases(Value resource) const; private: - // Maps resource value to unique ID and vice-versa. - void AddValueUniqueIDMapping(Value value, int64_t id) { + // Maps resource value to unique ID and vice-versa. Returns true of the + // mapping has changed. + bool AddValueUniqueIDMapping(Value value, int64_t id) { resource_value_to_ids_[value].insert(id); - id_to_resource_values_[id].insert(value); + return id_to_resource_values_[id].insert(value); } // Returns the set unique Values which map to `id`. const llvm::SmallSetVector& GetUniqueIdResources(int64_t id) const; + // Propagates the resource ID's from an input operand to a result. Returns + // true of the mapping has changed. + bool PropagateInputToOutput(const Value& operand, const OpResult& result); + + // Analyzes while loops to compute resourceID's for the loop results. + // `body_info` is the backtrack analysis info for the loop body. + void AnalyzeWhileLoop(Operation* while_op, + const BacktrackAnalysisInfo& body_info); + + // Analyzes tf.Case/tf.If ops to compute resourceID's. + template + void AnalyzeFunctionalCaseOrIfOp(CaseOrIfOp case_or_if_op, + llvm::ArrayRef functions, + const BacktrackAnalysis& backtrack_analysis); + + // Analyzes tf.CaseRegion/tf.IfRegion ops to compute resourceID's. 
+ void AnalyzeRegionCaseOrIfOp(Operation* case_or_if_op, + const BacktrackAnalysis& backtrack_analysis); + // Maps each resource-type value to a set of unique IDs that it could alias. llvm::SmallDenseMap, 8> resource_value_to_ids_; @@ -88,9 +113,18 @@ class ResourceAliasAnalysis : public detail::PerFunctionAggregateAnalysis< detail::ResourceAliasAnalysisInfo> { public: // Constructs analysis by analyzing the given module operation. - explicit ResourceAliasAnalysis(Operation* op); + explicit ResourceAliasAnalysis(ModuleOp module); }; +// Returns a range with just resource type values from the input range +// preserved. +template +auto filter_resources(RangeT&& range) { + return llvm::make_filter_range(std::forward(range), [](Value val) { + return getElementTypeOrSelf(val.getType()).isa(); + }); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index 9e78b90debc..4a2080c5951 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -21,27 +21,29 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/tf2xla/resource_operation_table.h" -#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { @@ -67,16 +69,12 @@ llvm::SmallDenseSet FindAccessedResources( Operation* op, const ResourceAliasAnalysis::Info& alias_analysis) { llvm::SmallDenseSet resources; - for (auto operand : op->getOperands()) { - if (!mlir::getElementTypeOrSelf(operand.getType()).isa()) - continue; + for (auto operand : filter_resources(op->getOperands())) { if (alias_analysis.IsUnknownResource(operand)) return UnknownResourceSet(); const auto& ids = alias_analysis.GetResourceUniqueIds(operand); resources.insert(ids.begin(), ids.end()); } - for (auto result : op->getResults()) { - if (!mlir::getElementTypeOrSelf(result.getType()).isa()) - continue; + for (auto result : filter_resources(op->getResults())) { if (alias_analysis.IsUnknownResource(result)) return UnknownResourceSet(); const auto& ids = alias_analysis.GetResourceUniqueIds(result); resources.insert(ids.begin(), ids.end()); @@ -84,67 +82,139 @@ llvm::SmallDenseSet 
FindAccessedResources( return resources; } -// Returns an XlaResourceOpInfo (or nullptr if it does not exist) that specifies -// the resource access type of the op. It tells whether the op is read only, -// etc. -// -// TODO(yuanzx): Define this information in a different place. Currently we use -// tensorflow/compiler/tf2xla/resource_operation_table.h. -const tensorflow::XlaResourceOpInfo* GetResourceInfoForOp(Operation* op) { - if (op->getName().getDialect() != - TF::TensorFlowDialect::getDialectNamespace()) { - return nullptr; +// Helper struct defining what memory effects are present for a resource. +struct SideEffects { + bool alloc = false; + bool free = false; + bool read = false; + bool write = false; + + bool IsAllocOnly() const { return alloc && !free && !read && !write; } + bool IsReadOnly() const { return !alloc && !free && read && !write; } +}; + +using ResourceSideEffectsByValue = llvm::SmallDenseMap; + +// Collects memory side effects for an operation by value (operands and +// results). +ResourceSideEffectsByValue GetResourceInfoForOp(Operation* op) { + ResourceSideEffectsByValue resource_info; + + auto interface = dyn_cast(op); + if (!interface) return resource_info; + + llvm::SmallVector effects; + interface.getEffects(effects); + + for (auto& effect : effects) { + // TODO(lyandy): Support effects with no value defined. + if (!effect.getValue()) return ResourceSideEffectsByValue(); + auto it = resource_info.try_emplace(effect.getValue()); + auto& side_effect = it.first->getSecond(); + auto* resource_effect = effect.getEffect(); + if (isa(resource_effect)) { + side_effect.alloc = true; + } else if (isa(resource_effect)) { + side_effect.free = true; + } else if (isa(resource_effect)) { + side_effect.read = true; + } else if (isa(resource_effect)) { + side_effect.write = true; + } else { + return ResourceSideEffectsByValue(); + } } - return tensorflow::GetResourceOpInfoForOp( - op->getName().getStringRef().split('.').second.str()); + + return resource_info; } -// Returns whether `op` accesses resources and it is known to be read-only. -bool OpIsReadOnly(Operation* op) { - auto resource_op_info = GetResourceInfoForOp(op); - return resource_op_info && - resource_op_info->kind() == tensorflow::XlaResourceOpKind::kRead; +// Checks if a value is a result of `op`. +bool IsOperationResult(Operation* op, Value value) { + return value.getDefiningOp() == op; +} + +// Checks if an operation's resource operands are read only. Operation results +// are ignored. +bool IsResourceOpReadOnly(Operation* op, + const ResourceSideEffectsByValue& resource_op_info) { + if (resource_op_info.empty()) return false; + + for (const auto& resource_info : resource_op_info) { + Value value = resource_info.getFirst(); + if (IsOperationResult(op, value)) continue; + const SideEffects& side_effects = resource_info.getSecond(); + if (!side_effects.IsReadOnly()) return false; + } + + return true; +} + +// Checks if an operation's resource results are alloc only and no side effects +// are present for its operands. +bool IsResourceOpAllocOnly(Operation* op, + const ResourceSideEffectsByValue& resource_op_info) { + if (resource_op_info.empty()) return false; + + for (const auto& resource_info : resource_op_info) { + // Operand with side effect. 
+ Value value = resource_info.getFirst(); + if (!IsOperationResult(op, value)) return false; + const SideEffects& side_effects = resource_info.getSecond(); + if (!side_effects.IsAllocOnly()) return false; + } + + return true; } // Returns if `op` is a resource declaration. bool OpIsDeclaration(Operation* op, const ResourceAliasAnalysis::Info& alias_analysis) { - // TODO(yuanzx): Add other types of resources. - return llvm::isa(op) || - (llvm::isa(op) && - !FindAccessedResources(op, alias_analysis).empty()); + return llvm::isa(op) && + !FindAccessedResources(op, alias_analysis).empty(); } -// Returns if `op` is know to not have any side effect. -bool OpIsKnownToHaveNoSideEffect(Operation* op) { - // TODO(riverriddle) We shouldn't treat all terminator operations as having - // side effects, this should be relaxed. - // TODO(riverriddle) Properly handle region side effects. - if (MemoryEffectOpInterface::hasNoEffect(op) && op->isKnownNonTerminator() && - op->getNumRegions() == 0) { - return true; - } - if (auto if_op = llvm::dyn_cast(op)) { - return if_op.is_stateless(); - } - if (auto while_op = llvm::dyn_cast(op)) { - return while_op.is_stateless(); - } +// A vector of resource variable id's with their associated resource value. +using ResourceIdsByValue = + llvm::SmallVector*>, 4>; - // Try to get the statefulness flag from the registry. - // - // TODO(yuanzx): Remove this after all ops are defined in the dialect. - if (op->getName().getDialect() != - TF::TensorFlowDialect::getDialectNamespace()) { - return false; - } - StringRef op_name = op->getName().getStringRef(); - // Drop the `tf.` prefix to query TF registry. - auto node_name = - op_name.drop_front(TensorFlowDialect::getDialectNamespace().size() + 1); - const tensorflow::OpRegistrationData* op_reg_data = - tensorflow::OpRegistry::Global()->LookUp(node_name.data()); - return op_reg_data && !op_reg_data->op_def.is_stateful(); +// Collects resource id's by resource value. If operation resource side effects +// are unknown or a resource is unknown, an empty optional is returned. +llvm::Optional GetResourceIdsByValue( + Operation* op, const ResourceAliasAnalysis::Info& alias_analysis, + const ResourceSideEffectsByValue& resource_op_info) { + ResourceIdsByValue resource_ids_by_value; + if (resource_op_info.empty()) return llvm::None; + + auto collect_ids = [&](ValueRange values) { + for (auto value : filter_resources(values)) { + if (alias_analysis.IsUnknownResource(value)) return false; + const auto& ids = alias_analysis.GetResourceUniqueIds(value); + resource_ids_by_value.push_back({value, &ids}); + } + return true; + }; + + if (collect_ids(op->getOperands()) && collect_ids(op->getResults())) + return resource_ids_by_value; + else + return llvm::None; +} + +// Returns true if `op` is known to not have any side effect. +bool OpIsKnownToHaveNoSideEffect(Operation* op) { + // Note: Identity op is really side-effect free, but it is not marked as such + // in the TF dialect (see comments in definition of Identity op in tf_ops.td) + // However, for adding control dependencies, its safe to assume + // that the Identity op is side-effect free. + if (isa(op)) return true; + + // For op's in the Tensorflow dialect, query the dialect. + if (op->getName().getDialect() == + TF::TensorFlowDialect::getDialectNamespace()) + return !TensorFlowDialect::CanHaveSideEffects(op); + + // Otherwise, conservatively assume that there can be side effects. 
+ return false; } } // namespace @@ -272,17 +342,17 @@ void SideEffectAnalysisInfo::AnalyzeRegion( if (OpIsDeclaration(&op, alias_analysis)) continue; auto resource_op_info = GetResourceInfoForOp(&op); - if (!resource_op_info && OpIsKnownToHaveNoSideEffect(&op)) continue; + if (resource_op_info.empty() && OpIsKnownToHaveNoSideEffect(&op)) + continue; - llvm::SmallDenseSet resources = - resource_op_info ? FindAccessedResources(&op, alias_analysis) - : UnknownResourceSet(); - assert(!resources.empty()); - const bool is_unknown = resources.count(kUnknownResourceId) > 0; - const bool read_only = OpIsReadOnly(&op); + if (IsResourceOpAllocOnly(&op, resource_op_info)) continue; + + auto resource_ids_by_value = + GetResourceIdsByValue(&op, alias_analysis, resource_op_info); + const bool read_only = IsResourceOpReadOnly(&op, resource_op_info); bool indirectly_tracked_unknown_access = false; // First add edges from known resources. - if (is_unknown) { + if (!resource_ids_by_value.hasValue()) { for (auto& entry : per_resource_access_info_) { if (entry.getFirst() == kUnknownResourceId) continue; AddPredecessorsForAccess(entry.getFirst(), &op, read_only); @@ -291,20 +361,43 @@ void SideEffectAnalysisInfo::AnalyzeRegion( read_only); } } else { - for (int64_t resource : resources) { - AddPredecessorsForAccess(resource, &op, read_only); + // Collect all resource id's and whether their side effect is read only. + llvm::SmallDenseMap read_only_by_resource_id; + for (const auto& resource_ids : *resource_ids_by_value) { + const bool is_result = resource_ids.first.getDefiningOp() == &op; + auto value_resource_info = resource_op_info.find(resource_ids.first); + bool resource_read_only = false; + if (value_resource_info != resource_op_info.end()) { + if (is_result && value_resource_info->getSecond().IsAllocOnly()) + continue; + resource_read_only = value_resource_info->getSecond().IsReadOnly(); + } + + for (const auto& id : *resource_ids.second) { + auto it = + read_only_by_resource_id.try_emplace(id, resource_read_only); + if (!it.second && !resource_read_only) + it.first->getSecond() = resource_read_only; + } + } + + for (const auto& resource : read_only_by_resource_id) { + const auto& resource_id = resource.getFirst(); + const auto& resource_read_only = resource.getSecond(); + AddPredecessorsForAccess(resource_id, &op, resource_read_only); indirectly_tracked_unknown_access |= - unknown_access_indirectly_tracked_by_resource(resource, - read_only); + unknown_access_indirectly_tracked_by_resource(resource_id, + resource_read_only); // Update access info for known resources. - TrackAccess(resource, &op, read_only); + TrackAccess(resource_id, &op, resource_read_only); } } + // If not indirectly tracked, add edges from the unknown resource. if (!indirectly_tracked_unknown_access) { AddPredecessorsForAccess(kUnknownResourceId, &op, read_only); } - if (is_unknown) { + if (!resource_ids_by_value.hasValue()) { // Update access info for unknown resource. TrackAccess(kUnknownResourceId, &op, read_only); } @@ -339,10 +432,7 @@ SideEffectAnalysisInfo::DirectControlSuccessors( } } // namespace detail -SideEffectAnalysis::SideEffectAnalysis(Operation* op) { - auto module = dyn_cast(op); - assert(module); - +SideEffectAnalysis::SideEffectAnalysis(ModuleOp module) { // Analyze entire module for alias analysis info. 
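
Roughly, the per-resource ordering rule that AnalyzeRegion above enforces through AddPredecessorsForAccess and TrackAccess is: a read-only access must be ordered after the last write to the resource, while a write must be ordered after the last write and after every read since that write. Below is a standalone sketch of just that rule, with plain ints standing in for op handles and resource IDs; the names are illustrative and not taken from this patch.

#include <map>
#include <vector>

struct AccessInfo {
  int last_write = -1;           // index of the last write, -1 if none yet
  std::vector<int> reads_since;  // reads observed since that write
};

// Returns the op indices the new access must be ordered after, and records
// the access for subsequent queries.
std::vector<int> PredecessorsForAccess(std::map<int, AccessInfo>& per_resource,
                                       int resource_id, int op_index,
                                       bool read_only) {
  AccessInfo& info = per_resource[resource_id];
  std::vector<int> preds;
  if (info.last_write >= 0) preds.push_back(info.last_write);
  if (read_only) {
    info.reads_since.push_back(op_index);
  } else {
    preds.insert(preds.end(), info.reads_since.begin(),
                 info.reads_since.end());
    info.last_write = op_index;
    info.reads_since.clear();
  }
  return preds;
}
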
ResourceAliasAnalysis alias_analysis(module); diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h index c92c6e1882c..a75f7eb7dee 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -130,7 +130,7 @@ class SideEffectAnalysis : public detail::PerFunctionAggregateAnalysis< detail::SideEffectAnalysisInfo> { public: // Constructs analysis by analyzing the given module operation. - explicit SideEffectAnalysis(Operation* op); + explicit SideEffectAnalysis(ModuleOp module); }; } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD index 801e35280d6..5c6f39699bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -2,7 +2,6 @@ load( "//tensorflow:tensorflow.bzl", "tf_copts", "tf_cuda_library", - "tfe_xla_copts", ) package( @@ -20,7 +19,7 @@ tf_cuda_library( srcs = [ "c_api_unified_experimental_mlir.cc", ], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), deps = [ "//tensorflow/c:c_api", "//tensorflow/c:tensor_interface", @@ -41,6 +40,7 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/llvm_rtti", "//tensorflow/core/platform:errors", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index 66447995709..6bfe4c302cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/raw_ostream.h" @@ -42,6 +43,7 @@ limitations under the License. 
#include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -64,21 +66,18 @@ using tensorflow::AbstractTensorInterface; using tensorflow::dyn_cast; using tensorflow::OutputList; using tensorflow::string; +using tensorflow::errors::FailedPrecondition; +using tensorflow::errors::InvalidArgument; +using tensorflow::errors::Unimplemented; using tensorflow::tracing::TracingContext; using tensorflow::tracing::TracingOperation; using tensorflow::tracing::TracingTensorHandle; namespace { -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; +void RegisterDialects(mlir::MLIRContext& ctx) { + mlir::RegisterAllTensorFlowDialects(ctx.getDialectRegistry()); + ctx.getDialectRegistry().loadAll(&ctx); } Status ConvertDataTypeToTensor(tensorflow::DataType dtype, Builder builder, @@ -95,7 +94,7 @@ class MlirTensor : public TracingTensorHandle { tensorflow::DataType DataType() const override { tensorflow::DataType type; - Status s = ConvertScalarTypeToDataType(value_.getType(), &type); + Status s = ConvertToDataType(value_.getType(), &type); if (!s.ok()) { return tensorflow::DT_INVALID; } @@ -103,6 +102,9 @@ class MlirTensor : public TracingTensorHandle { } Value getValue() { return value_; } + Type getElementType() { + return value_.getType().cast().getElementType(); + } // For LLVM style RTTI. static bool classof(const AbstractTensorHandle* ptr) { @@ -184,11 +186,18 @@ class MlirAbstractOp : public TracingOperation { } private: + // Return true is there are still unfilled ODS slots for adding more inputs. + bool IsNextODSArgAvailable(); + MLIRContext* context_; MlirFunctionContext* function_context_; SmallVector operands_; llvm::StringMap attrs_; std::unique_ptr state_; + // This is the index of the next ODS operand that will be added with AddInput + // or AddInput; + int current_ods_input_ = 0; + const tensorflow::OpDef* op_def_ = nullptr; const char* op_name_ = nullptr; string tf_op_type_; // TODO(srbs): Use this. 
@@ -225,6 +234,7 @@ class MlirFunctionContext : public TracingContext { : TracingContext(kMlir), context_(std::make_unique()), builder_(context_.get()) { + RegisterDialects(*context_); // TODO(aminim) figure out the location story here module_ = ModuleOp::create(builder_.getUnknownLoc()); func_ = FuncOp::create(builder_.getUnknownLoc(), name, @@ -244,12 +254,12 @@ class MlirFunctionContext : public TracingContext { Status Finalize(OutputList* outputs, AbstractFunction** f) override; Status RegisterFunction(AbstractFunction* func) override { - return tensorflow::errors::Unimplemented( + return Unimplemented( "Registering graph functions has not been implemented yet."); } Status RemoveFunction(const string& func) override { - return tensorflow::errors::Unimplemented( + return Unimplemented( "MlirFunctionContext::RemoveFunction has not been implemented yet."); } @@ -264,9 +274,12 @@ class MlirFunctionContext : public TracingContext { Status MlirAbstractOp::Reset(const char* op, const char* device_name) { if (state_) { - return tensorflow::errors::FailedPrecondition( - "Reset called on already built op."); + return FailedPrecondition("Reset called on already built op."); } + TF_RETURN_IF_ERROR( + tensorflow::OpRegistry::Global()->LookUpOpDef(op, &op_def_)); + assert(op_def_); + tf_op_type_ = op; std::string name = "tf."; name += op; @@ -277,13 +290,12 @@ Status MlirAbstractOp::Reset(const char* op, const char* device_name) { Status MlirAbstractOp::SetAttrType(const char* attr_name, tensorflow::DataType dtype) { - if (!state_) { - return Status(tensorflow::error::Code::FAILED_PRECONDITION, - "op_type must be specified before specifying attrs."); - } + if (!state_) + return FailedPrecondition( + "op_type must be specified before specifying attrs."); Type mlir_type; Builder builder(context_); - TF_RETURN_IF_ERROR(ConvertDataTypeToTensor(dtype, builder, &mlir_type)); + TF_RETURN_IF_ERROR(ConvertDataType(dtype, builder, &mlir_type)); attrs_[attr_name] = TypeAttr::get(mlir_type); return Status::OK(); } @@ -291,8 +303,7 @@ Status MlirAbstractOp::SetAttrType(const char* attr_name, Status MlirAbstractOp::SetOpName(const char* const op_name) { // TODO(aminim): should we use a location? if (op_name_) { - return tensorflow::errors::FailedPrecondition( - "SetOpName called on already built op."); + return FailedPrecondition("SetOpName called on already built op."); } op_name_ = op_name; return Status::OK(); @@ -301,8 +312,7 @@ Status MlirAbstractOp::SetOpName(const char* const op_name) { Status MlirAbstractOp::AddRef(Type type, Type* output_type) { Type elt_type = getElementTypeOrSelf(type); if (elt_type.isa()) { - return tensorflow::errors::InvalidArgument( - "Requested reference to a reference type"); + return InvalidArgument("Requested reference to a reference type"); } elt_type = TensorFlowRefType::get(elt_type); if (RankedTensorType tensor_type = type.dyn_cast()) { @@ -315,138 +325,97 @@ Status MlirAbstractOp::AddRef(Type type, Type* output_type) { Status MlirAbstractOp::Create(ArrayRef operands, OperationState** state) { state_->operands = llvm::to_vector<4>(operands); - const tensorflow::OpDef* op_def; - auto node_name = state_->name.getStringRef().drop_front( - TensorFlowDialect::getDialectNamespace().size() + 1); - TF_RETURN_IF_ERROR( - tensorflow::OpRegistry::Global()->LookUpOpDef(node_name.str(), &op_def)); Builder builder(context_); - // Process operands according to the op_def and infer derived attributes. 
- int current_operand = 0; - for (const tensorflow::OpDef::ArgDef& input_arg : op_def->input_arg()) { - if (!input_arg.number_attr().empty()) { - // TODO(b/156122856): we don't support variadic operands. - return tensorflow::errors::Unimplemented( - "Unsupported 'number_attr' for '", input_arg.number_attr(), "'"); - } else if (!input_arg.type_list_attr().empty()) { - return tensorflow::errors::InvalidArgument( - "Unsupported 'type_list_attr' for '", input_arg.number_attr(), "'"); - } - if (current_operand >= operands.size()) { - return tensorflow::errors::InvalidArgument("Missing operand for '", - input_arg.name(), "'"); - } - Type expected_type; - if (input_arg.type() != tensorflow::DT_INVALID) { - TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(input_arg.type(), builder, &expected_type)); - Type output_type; - if (input_arg.is_ref()) - TF_RETURN_IF_ERROR(AddRef(expected_type, &output_type)); - expected_type = output_type; - } else { - expected_type = operands[current_operand].getType(); - } - if (!input_arg.type_attr().empty()) { - attrs_[input_arg.type_attr()] = TypeAttr::get(expected_type); - } - ++current_operand; - } - for (const tensorflow::OpDef::ArgDef& output_arg : op_def->output_arg()) { + if (current_ods_input_ != op_def_->input_arg_size()) + return InvalidArgument(absl::StrCat("Mismatch in operands number: got ", + current_ods_input_, " expected ", + op_def_->input_arg_size(), " ; for op ", + state_->name.getStringRef().str())); + + // Process results according to the op_def and infer types for derived + // attributes. + for (const tensorflow::OpDef::ArgDef& output_arg : op_def_->output_arg()) { int original_size = state_->types.size(); if (!output_arg.number_attr().empty()) { // Same type repeated "repeats" times. Attribute repeats_attr = attrs_[output_arg.number_attr()]; - if (!repeats_attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.number_attr(), - "' required for output list '", output_arg.name(), "'"); - } - if (!repeats_attr.isa()) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.number_attr(), - "' required for output list '", output_arg.name(), - "' isn't an integer"); - } + if (!repeats_attr) + return InvalidArgument("Missing attribute '", output_arg.number_attr(), + "' required for output list '", + output_arg.name(), "'"); + if (!repeats_attr.isa()) + return InvalidArgument("Attribute '", output_arg.number_attr(), + "' required for output list '", + output_arg.name(), "' isn't an integer"); int64_t repeats = repeats_attr.cast().getInt(); if (!output_arg.type_attr().empty()) { // Same type repeated "repeats" times. 
Attribute attr = attrs_[output_arg.type_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.type_attr(), - "' required for output '", output_arg.name(), "'"); - } + if (!attr) + return InvalidArgument("Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "'"); TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_attr(), "' required for output '", - output_arg.name(), "' isn't a type attribute"); - } + if (!type_attr) + return InvalidArgument("Attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "' isn't a type attribute"); for (int i = 0; i < repeats; ++i) - state_->types.push_back(type_attr.getType()); + state_->types.push_back(UnrankedTensorType::get(type_attr.getType())); } else if (output_arg.type() != tensorflow::DT_INVALID) { for (int i = 0; i < repeats; ++i) { Type type; TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(output_arg.type(), builder, &type)); + ConvertDataType(output_arg.type(), builder, &type)); state_->types.push_back(type); } } else { - return tensorflow::errors::InvalidArgument( - "Missing type or type_attr field in ", - output_arg.ShortDebugString()); + return InvalidArgument("Missing type or type_attr field in ", + output_arg.ShortDebugString()); } } else if (!output_arg.type_attr().empty()) { Attribute attr = attrs_[output_arg.type_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.type_attr(), - "' required for output '", output_arg.name(), "'"); - } + if (!attr) + return InvalidArgument("Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "'"); TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_attr(), "' required for output '", - output_arg.name(), "' isn't a type attribute"); - } - state_->types.push_back(type_attr.getValue()); + if (!type_attr) + return InvalidArgument("Attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "' isn't a type attribute"); + state_->types.push_back(UnrankedTensorType::get(type_attr.getValue())); } else if (!output_arg.type_list_attr().empty()) { // This is pointing to an attribute which is an array of types. 
Attribute attr = attrs_[output_arg.type_list_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( + if (!attr) + return InvalidArgument( "Missing attribute '", output_arg.type_list_attr(), "' required for output '", output_arg.name(), "'"); - } ArrayAttr array_attr = attr.dyn_cast(); - if (!array_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_list_attr(), - "' required for output '", output_arg.name(), - "' isn't an array attribute"); - } + if (!array_attr) + return InvalidArgument("Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' isn't an array attribute"); for (Attribute attr : array_attr) { TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Array Attribute '", output_arg.type_list_attr(), - "' required for output '", output_arg.name(), - "' has a non-Type element"); - } - state_->types.push_back(type_attr.getValue()); + if (!type_attr) + return InvalidArgument("Array Attribute '", + output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' has a non-Type element"); + state_->types.push_back(UnrankedTensorType::get(type_attr.getValue())); } } else if (output_arg.type() != tensorflow::DT_INVALID) { Type type; Builder builder(context_); - TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(output_arg.type(), builder, &type)); + TF_RETURN_IF_ERROR(ConvertDataType(output_arg.type(), builder, &type)); state_->types.push_back(type); } else { - return tensorflow::errors::InvalidArgument("No type fields in ", - output_arg.ShortDebugString()); + return InvalidArgument("No type fields in ", + output_arg.ShortDebugString()); } if (output_arg.is_ref()) { // For all types that were added by this function call, make them refs. 
@@ -458,6 +427,7 @@ Status MlirAbstractOp::Create(ArrayRef operands, } } } + for (auto& it : attrs_) state_->addAttribute(it.first(), it.second); *state = state_.get(); return Status::OK(); } @@ -471,88 +441,68 @@ Status MlirAbstractOp::SetDeviceName(const char* name) { return Status::OK(); } -Status MlirAbstractOp::AddInputList( - absl::Span inputs) { - return tensorflow::errors::Unimplemented( - "AddInputList has not been implemented yet."); -} - Status MlirAbstractOp::SetAttrString(const char* attr_name, const char* data, size_t length) { - return tensorflow::errors::Unimplemented( - "SetAttrString has not been implemented yet."); + return Unimplemented("SetAttrString has not been implemented yet."); } Status MlirAbstractOp::SetAttrInt(const char* attr_name, int64_t value) { - return tensorflow::errors::Unimplemented( - "SetAttrInt has not been implemented yet."); + return Unimplemented("SetAttrInt has not been implemented yet."); } Status MlirAbstractOp::SetAttrFloat(const char* attr_name, float value) { - return tensorflow::errors::Unimplemented( - "SetAttrFloat has not been implemented yet."); + return Unimplemented("SetAttrFloat has not been implemented yet."); } Status MlirAbstractOp::SetAttrBool(const char* attr_name, bool value) { - return tensorflow::errors::Unimplemented( - "SetAttrBool has not been implemented yet."); + attrs_[attr_name] = BoolAttr::get(value, context_); + return Status::OK(); } Status MlirAbstractOp::SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) { - return tensorflow::errors::Unimplemented( - "SetAttrShape has not been implemented yet."); + return Unimplemented("SetAttrShape has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunction(const char* attr_name, const AbstractOperation* value) { - return tensorflow::errors::Unimplemented( - "SetAttrFunction has not been implemented yet."); + return Unimplemented("SetAttrFunction has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunctionName(const char* attr_name, const char* value, size_t length) { - return tensorflow::errors::Unimplemented( - "SetAttrFunctionName has not been implemented yet."); + return Unimplemented("SetAttrFunctionName has not been implemented yet."); } Status MlirAbstractOp::SetAttrTensor(const char* attr_name, AbstractTensorInterface* tensor) { - return tensorflow::errors::Unimplemented( - "SetAttrTensor has not been implemented yet."); + return Unimplemented("SetAttrTensor has not been implemented yet."); } Status MlirAbstractOp::SetAttrStringList(const char* attr_name, const void* const* values, const size_t* lengths, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrStringList has not been implemented yet."); + return Unimplemented("SetAttrStringList has not been implemented yet."); } Status MlirAbstractOp::SetAttrFloatList(const char* attr_name, const float* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrFloatList has not been implemented yet."); + return Unimplemented("SetAttrFloatList has not been implemented yet."); } Status MlirAbstractOp::SetAttrIntList(const char* attr_name, const int64_t* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrIntList has not been implemented yet."); + return Unimplemented("SetAttrIntList has not been implemented yet."); } Status MlirAbstractOp::SetAttrTypeList(const char* attr_name, const tensorflow::DataType* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrTypeList has not been 
implemented yet."); + return Unimplemented("SetAttrTypeList has not been implemented yet."); } Status MlirAbstractOp::SetAttrBoolList(const char* attr_name, const unsigned char* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrBoolList has not been implemented yet."); + return Unimplemented("SetAttrBoolList has not been implemented yet."); } Status MlirAbstractOp::SetAttrShapeList(const char* attr_name, const int64_t** dims, const int* num_dims, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrShapeList has not been implemented yet."); + return Unimplemented("SetAttrShapeList has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunctionList( const char* attr_name, absl::Span values) { - return tensorflow::errors::Unimplemented( - "SetAttrFunctionList has not been implemented yet."); + return Unimplemented("SetAttrFunctionList has not been implemented yet."); } Status MlirFunction::GetFunctionDef(tensorflow::FunctionDef** f) { @@ -604,28 +554,101 @@ Status MlirFunctionContext::AddParameter(tensorflow::DataType dtype, } Status MlirAbstractOp::AddInput(AbstractTensorHandle* input) { + if (current_ods_input_ >= op_def_->input_arg_size()) + return InvalidArgument( + absl::StrCat("More Input() (", current_ods_input_, ") calls than the ", + op_def_->input_arg_size(), " allowed input_args ; for op ", + state_->name.getStringRef().str())); + auto* operand = dyn_cast(input); - if (!operand) { - return tensorflow::errors::InvalidArgument( - "Unable to cast input to MlirTensor"); - } + if (!operand) return InvalidArgument("Unable to cast input to MlirTensor"); operands_.push_back(operand->getValue()); + + // Get the next ArgDef and use it to infer the derived attributes associated + // to this input. + const tensorflow::OpDef::ArgDef& arg_def = + op_def_->input_arg(current_ods_input_++); + Type expected_type; + if (arg_def.type() != tensorflow::DT_INVALID) { + Builder builder(context_); + TF_RETURN_IF_ERROR( + tensorflow::ConvertDataType(arg_def.type(), builder, &expected_type)); + if (arg_def.is_ref()) { + Type output_type; + TF_RETURN_IF_ERROR(AddRef(expected_type, &output_type)); + expected_type = output_type; + } + } else { + expected_type = cast(input)->getElementType(); + } + if (!arg_def.type_attr().empty()) + attrs_[arg_def.type_attr()] = TypeAttr::get(expected_type); + return Status::OK(); } + +Status MlirAbstractOp::AddInputList( + absl::Span inputs) { + if (current_ods_input_ >= op_def_->input_arg_size()) + return InvalidArgument( + absl::StrCat("More Input() (", current_ods_input_, ") calls than the ", + op_def_->input_arg_size(), " allowed input_args")); + + for (AbstractTensorHandle* input : inputs) { + auto* operand = dyn_cast(input); + if (!operand) return InvalidArgument("Unable to cast input to MlirTensor"); + operands_.push_back(operand->getValue()); + } + + // Get the next ArgDef and use it to infer the derived attributes associated + // to this input. + const tensorflow::OpDef::ArgDef& arg_def = + op_def_->input_arg(current_ods_input_++); + if (!arg_def.number_attr().empty()) { + Builder builder(context_); + attrs_[arg_def.number_attr()] = builder.getI32IntegerAttr(inputs.size()); + // TODO(aminim): handle ref variable. + if (arg_def.type() != tensorflow::DT_INVALID) { + // TODO(aminim): check type wrt input + Type arg_def_type; + TF_RETURN_IF_ERROR( + ConvertDataType(arg_def.type(), builder, &arg_def_type)); + // Ensure each of the type in the list matches the op def type. 
+ // TODO(aminim): can we improve the error message with the actual types? + for (AbstractTensorHandle* input : inputs) + if (arg_def_type != cast(input)->getElementType()) + return InvalidArgument( + "Invalid input list: type mismatch the op def expectation"); + } else if (!inputs.empty()) { + if (arg_def.type_attr().empty()) + return FailedPrecondition( + "Invalid opdef type constraint: either type or type_attr required"); + + attrs_[arg_def.type_attr()] = + TypeAttr::get(cast(inputs.front())->getElementType()); + } + } else if (!arg_def.type_list_attr().empty()) { + // TODO(aminim): handle ref variable. + SmallVector types; + types.reserve(inputs.size()); + for (AbstractTensorHandle* input : inputs) + types.push_back(TypeAttr::get(cast(input)->getElementType())); + attrs_[arg_def.type_list_attr()] = ArrayAttr::get(types, GetContext()); + } + return Status::OK(); +} + Status MlirFunctionContext::Finalize(OutputList* outputs, AbstractFunction** f) { Block& body = func_.getBody().front(); SmallVector ret_operands; for (auto* output : outputs->outputs) { auto* operand = dyn_cast(output); - if (!operand) { - return tensorflow::errors::InvalidArgument( - "Capturing eager tensors is not supported yet."); - } - if (operand->getValue().getContext() != context_.get()) { - return tensorflow::errors::InvalidArgument( + if (!operand) + return InvalidArgument("Capturing eager tensors is not supported yet."); + if (operand->getValue().getContext() != context_.get()) + return InvalidArgument( "Capturing tensors from other context is not supported."); - } ret_operands.push_back(operand->getValue()); } builder_.create(func_.getLoc(), ret_operands); @@ -640,7 +663,6 @@ Status MlirFunctionContext::Finalize(OutputList* outputs, extern "C" { TracingContext* MlirTracingFactory(const char* fn_name, TF_Status* s) { - RegisterDialects(); return new MlirFunctionContext(fn_name); } } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/dialect_registration.h similarity index 50% rename from tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc rename to tensorflow/compiler/mlir/tensorflow/dialect_registration.h index 45985cea583..a63bfd154ab 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/dialect_registration.h @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,22 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" namespace mlir { - -// Static initialization for TF dialect registration. 
-static DialectRegistration tf_ops; -static DialectRegistration - tf_executor_dialect; -static DialectRegistration - tf_device_dialect; -static DialectRegistration - tf_saved_model_dialect; -static DialectRegistration shape_dialect; - +// Inserts all the TensorFlow dialects in the provided registry. This is +// intended for tools that need to register dialects before parsing .mlir files. +inline void RegisterAllTensorFlowDialects(DialectRegistry ®istry) { + registry.insert(); +} } // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_DIALECT_REGISTRATION_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc index dfad1fce26d..40cc2c99c27 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc @@ -74,12 +74,9 @@ struct FuncAttrStorage : public AttributeStorage { // Get or create a shape attribute. ShapeAttr ShapeAttr::get(mlir::MLIRContext* context, llvm::Optional> shape) { - if (shape) - return Base::get(context, AttrKind::SHAPE, *shape, - /*unranked=*/false); + if (shape) return Base::get(context, *shape, /*unranked=*/false); - return Base::get(context, AttrKind::SHAPE, ArrayRef(), - /*unranked=*/true); + return Base::get(context, ArrayRef(), /*unranked=*/true); } llvm::Optional> ShapeAttr::getValue() const { @@ -112,12 +109,12 @@ bool ShapeAttr::hasStaticShape() const { FuncAttr FuncAttr::get(mlir::MLIRContext* context, llvm::StringRef name, DictionaryAttr attr) { auto symbol = SymbolRefAttr::get(name, context); - return Base::get(context, AttrKind::FUNC, symbol, attr); + return Base::get(context, symbol, attr); } FuncAttr FuncAttr::get(mlir::MLIRContext* context, SymbolRefAttr symbol, DictionaryAttr attr) { - return Base::get(context, AttrKind::FUNC, symbol, attr); + return Base::get(context, symbol, attr); } SymbolRefAttr FuncAttr::GetName() const { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h index 1edc7356ab4..5a18b77ab5c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -24,19 +24,6 @@ limitations under the License. namespace mlir { namespace TF { -namespace AttrKind { - -// List of supported custom TensorFlow Attribute kinds, necessary for -// isa/dyn_cast. -enum Kind { - FIRST_USED_TENSORFLOW_ATTR = Attribute::FIRST_TENSORFLOW_ATTR, - SHAPE = FIRST_USED_TENSORFLOW_ATTR, - FUNC, - LAST_USED_TENSORFLOW_ATTR, -}; - -} // namespace AttrKind - namespace detail { struct ShapeAttrStorage; @@ -70,8 +57,6 @@ class ShapeAttr : public Attribute::AttrBase= 0), it has static // shape. bool hasStaticShape() const; - - static bool kindof(unsigned kind) { return kind == AttrKind::SHAPE; } }; // Custom attribute to model AttrValue.value.func (NameAttrList type attribute). 
@@ -97,8 +82,6 @@ class FuncAttr SymbolRefAttr GetName() const; DictionaryAttr GetAttrs() const; - - static bool kindof(unsigned kind) { return kind == AttrKind::FUNC; } }; } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 77008b55672..0e85582337d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -101,7 +101,8 @@ bool BlockWrapsSingleOp(Block* block) { } // end anonymous namespace TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) - : Dialect(/*name=*/"tf_device", context) { + : Dialect(/*name=*/"tf_device", context, + TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" @@ -118,31 +119,6 @@ TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) // operation results are perfectly forwarded to the launch return. bool LaunchOp::WrapsSingleOp() { return BlockWrapsSingleOp(&GetBody()); } -//===----------------------------------------------------------------------===// -// tf_device.return -//===----------------------------------------------------------------------===// - -namespace { -ParseResult ParseReturnOp(OpAsmParser* parser, OperationState* state) { - llvm::SmallVector op_info; - llvm::SmallVector types; - llvm::SMLoc loc = parser->getCurrentLocation(); - return failure(parser->parseOperandList(op_info) || - (!op_info.empty() && parser->parseColonTypeList(types)) || - parser->resolveOperands(op_info, types, loc, state->operands)); -} - -void Print(ReturnOp op, OpAsmPrinter* p) { - *p << op.getOperationName(); - if (op.getNumOperands() > 0) { - *p << ' '; - p->printOperands(op.getOperands()); - *p << " : "; - interleaveComma(op.getOperandTypes(), *p); - } -} -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_device.parallel_execute //===----------------------------------------------------------------------===// @@ -393,7 +369,7 @@ void Print(ReplicateOp op, OpAsmPrinter* p) { // [%a, ...] as %block_arg0: type // packed_input // %b as %block_arg1: type - const int32_t n = op.n().getSExtValue(); + const int32_t n = op.n(); const int32_t num_replicated_inputs = (*op.operand_segment_sizes().int_value_begin()).getSExtValue(); const int32_t num_replicated_block_args = num_replicated_inputs / n; @@ -437,7 +413,7 @@ LogicalResult VerifyCompatibleTypes(Type a, Type b) { } LogicalResult Verify(ReplicateOp op) { - int32_t n = op.n().getSExtValue(); + int32_t n = op.n(); // Check number of devices, if set, matches `n`. if (op.devices().hasValue()) { @@ -694,12 +670,12 @@ void LaunchOp::getCanonicalizationPatterns(OwningRewritePatternList& results, results.insert(context); } +} // namespace tf_device +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" - -} // namespace tf_device -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h index d1ca07d85a7..5b1d9711875 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -36,15 +36,16 @@ namespace tf_device { // XlaRun. 
class TensorFlowDeviceDialect : public Dialect { public: + static StringRef getDialectNamespace() { return "tf_device"; } // Constructing TensorFlowDevice dialect under an non-null MLIRContext. explicit TensorFlowDeviceDialect(MLIRContext* context); }; +} // namespace tf_device +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h.inc" -} // namespace tf_device -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td index 565be63a74f..8f1cd6877e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -36,7 +36,7 @@ def TfDevice_Dialect : Dialect { XlaRun. }]; - let cppNamespace = "tf_device"; + let cppNamespace = "::mlir::tf_device"; } //===----------------------------------------------------------------------===// @@ -104,8 +104,7 @@ The `tf_device.return` operation terminates and returns values from a }]> ]; - let parser = [{ return Parse$cppClass(&parser, &result); }]; - let printer = [{ return Print(*this, &p); }]; + let assemblyFormat = "attr-dict ($results^ `:` type($results))?"; } def TfDevice_LaunchFuncOp : TfDevice_Op<"launch_func", []> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index c18723b0982..f2d0a548420 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -54,9 +54,6 @@ namespace tf_executor { namespace { -using TF::DropRefType; -using TF::DropTypeSubTypes; - struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -75,9 +72,8 @@ struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { } }; -struct TensorFlowExecutorOpFolderDialectInterface - : public OpFolderDialectInterface { - using OpFolderDialectInterface::OpFolderDialectInterface; +struct TensorFlowExecutorDialectFoldInterface : public DialectFoldInterface { + using DialectFoldInterface::DialectFoldInterface; // Registered hook to check if the given region, which is attached to an // operation that is *not* isolated from above (i.e. 
no internal regions @@ -92,14 +88,15 @@ struct TensorFlowExecutorOpFolderDialectInterface } // namespace TensorFlowExecutorDialect::TensorFlowExecutorDialect(MLIRContext *context) - : Dialect(/*name=*/"tf_executor", context) { + : Dialect(/*name=*/"tf_executor", context, + TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" >(); addInterfaces(); + TensorFlowExecutorDialectFoldInterface>(); addTypes(); } @@ -253,33 +250,6 @@ ParseResult ParseGraphOp(OpAsmParser &parser, OperationState &result) { // tf_executor.fetch //===----------------------------------------------------------------------===// -namespace { - -void Print(FetchOp fetch, OpAsmPrinter &p) { - p << fetch.getOperationName(); - if (fetch.getNumOperands() > 0) { - p << ' '; - p.printOperands(fetch.operand_begin(), fetch.operand_end()); - p << " : "; - interleaveComma(fetch.getOperandTypes(), p); - } - p.printOptionalAttrDict(fetch.getAttrs()); -} - -ParseResult ParseFetchOp(OpAsmParser &parser, OperationState &result) { - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands) || - parser.parseOptionalAttrDict(result.attributes) - - ); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.island //===----------------------------------------------------------------------===// @@ -414,31 +384,6 @@ ParseResult ParseIslandOp(OpAsmParser &parser, OperationState &result) { // tf_executor.yield //===----------------------------------------------------------------------===// -namespace { - -void Print(YieldOp yield, OpAsmPrinter &p) { - p << yield.getOperationName(); - if (yield.getNumOperands() > 0) { - p << ' '; - p.printOperands(yield.operand_begin(), yield.operand_end()); - p << " : "; - interleaveComma(yield.getOperandTypes(), p); - } - p.printOptionalAttrDict(yield.getAttrs()); -} - -ParseResult ParseYieldOp(OpAsmParser &parser, OperationState &result) { - SmallVector op_info; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(op_info) || - (!op_info.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(op_info, types, loc, result.operands) || - parser.parseOptionalAttrDict(result.attributes)); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.Switch //===----------------------------------------------------------------------===// @@ -550,8 +495,8 @@ LogicalResult Verify(SwitchNOp switchn) { << operand0_tensor_type << " vs " << output_tensor_type; } Type broadcasted_type = OpTrait::util::getBroadcastedType( - DropRefType(DropTypeSubTypes(operand0_tensor_type)), - DropRefType(DropTypeSubTypes(output_tensor_type))); + TF::DropRefAndSubTypes(operand0_tensor_type), + TF::DropRefAndSubTypes(output_tensor_type)); if (!broadcasted_type) { return switchn.emitOpError() << "expects data operand to be broadcastable with all output types" @@ -667,8 +612,8 @@ LogicalResult Verify(MergeOp merge) { << operand_tensor_ty << " vs " << output_tensor_ty; } Type broadcasted_type = OpTrait::util::getBroadcastedType( - DropRefType(DropTypeSubTypes(output_tensor_ty)), - DropRefType(DropTypeSubTypes(operand_tensor_ty))); + 
TF::DropRefAndSubTypes(output_tensor_ty), + TF::DropRefAndSubTypes(operand_tensor_ty)); if (!broadcasted_type) return merge.emitOpError() << "expects all operands to be broadcastable with output type" @@ -851,23 +796,6 @@ LogicalResult Verify(NextIterationSourceOp source) { return success(); } -void Print(NextIterationSourceOp next_iteration, OpAsmPrinter &p) { - p << next_iteration.getOperationName() << " : " << next_iteration.getType(0); - p.printOptionalAttrDict(next_iteration.getAttrs()); -} - -ParseResult ParseNextIterationSourceOp(OpAsmParser &parser, - OperationState &result) { - SmallVector types; - if (parser.parseColonTypeList(types)) return failure(); - - MLIRContext *context = parser.getBuilder().getContext(); - Type token_type = TokenType::get(context); - Type control_type = ControlType::get(context); - result.addTypes({types.front(), token_type, control_type}); - return parser.parseOptionalAttrDict(result.attributes); -} - } // anonymous namespace //===----------------------------------------------------------------------===// @@ -894,36 +822,6 @@ LogicalResult Verify(NextIterationSinkOp sink) { return success(); } -void Print(NextIterationSinkOp next_iteration, OpAsmPrinter &p) { - p << next_iteration.getOperationName() << " ["; - p.printOperand(next_iteration.getOperand(0)); - p << "] "; - p.printOperands(llvm::drop_begin(next_iteration.getOperands(), 1)); - p << " : " << next_iteration.getOperand(1).getType(); - p.printOptionalAttrDict(next_iteration.getAttrs()); -} - -ParseResult ParseNextIterationSinkOp(OpAsmParser &parser, - OperationState &result) { - SmallVector op_infos; - llvm::SMLoc loc = parser.getCurrentLocation(); - - // First type is always the token consumed from the NextIteration.source - Type token_type = TokenType::get(parser.getBuilder().getContext()); - SmallVector types = {token_type}; - - if (parser.parseOperandList(op_infos, 1, OpAsmParser::Delimiter::Square) || - parser.parseOperandList(op_infos) || parser.parseColonTypeList(types)) - return failure(); - - Type control_type = ControlType::get(parser.getBuilder().getContext()); - types.append(op_infos.size() - 2, control_type); - if (parser.resolveOperands(op_infos, types, loc, result.operands)) - return failure(); - - return parser.parseOptionalAttrDict(result.attributes); -} - } // anonymous namespace //===----------------------------------------------------------------------===// @@ -962,32 +860,6 @@ ParseResult ParseExitOp(OpAsmParser &parser, OperationState &result) { // tf_executor.ControlTrigger //===----------------------------------------------------------------------===// -namespace { - -void Print(ControlTriggerOp trigger, OpAsmPrinter &p) { - p << trigger.getOperationName() << ' '; - p.printOperands(trigger.getOperands()); - p.printOptionalAttrDict(trigger.getAttrs()); -} - -ParseResult ParseControlTriggerOp(OpAsmParser &parser, OperationState &result) { - SmallVector op_infos; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - - if (parser.parseOperandList(op_infos)) return failure(); - Type control_type = ControlType::get(parser.getBuilder().getContext()); - types.append(op_infos.size(), control_type); - if (parser.resolveOperands(op_infos, types, loc, result.operands)) - return failure(); - - // Single control as the only output - result.types.push_back(control_type); - return parser.parseOptionalAttrDict(result.attributes); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.LoopCond 
//===----------------------------------------------------------------------===// @@ -1249,12 +1121,12 @@ LogicalResult IslandOp::fold(llvm::ArrayRef operands, return success(); } +} // namespace tf_executor +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" - -} // namespace tf_executor -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h index 3bb30f16c3d..2bc13556b4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -35,6 +35,7 @@ namespace tf_executor { class TensorFlowExecutorDialect : public Dialect { public: + static StringRef getDialectNamespace() { return "tf_executor"; } explicit TensorFlowExecutorDialect(MLIRContext *context); // Parses a type registered to this dialect. @@ -44,44 +45,23 @@ class TensorFlowExecutorDialect : public Dialect { void printType(Type type, DialectAsmPrinter &os) const override; }; -namespace TFTypes { -enum Kind { - Control = Type::FIRST_TENSORFLOW_EXECUTOR_TYPE, - Token, -}; -} // namespace TFTypes - // The Control type is a token-like value that models control dependencies from // TensorFlow graphs. class ControlType : public Type::TypeBase { public: using Base::Base; - - static ControlType get(MLIRContext *context) { - return Base::get(context, TFTypes::Control); - } - - // Support method to enable LLVM-style type casting. - static bool kindof(unsigned kind) { return kind == TFTypes::Control; } }; class TokenType : public Type::TypeBase { public: using Base::Base; - - static TokenType get(MLIRContext *context) { - return Base::get(context, TFTypes::Token); - } - - // Support method to enable LLVM-style type casting. - static bool kindof(unsigned kind) { return kind == TFTypes::Token; } }; +} // namespace tf_executor +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h.inc" -} // namespace tf_executor -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_EXECUTOR_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 3081018b8da..713ddc44cba 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -43,14 +43,16 @@ def TfExecutor_Dialect : Dialect { value). }]; - let cppNamespace = "tf_executor"; + let cppNamespace = "::mlir::tf_executor"; } // Control type. -def TfeControlType : Type()">, "control">; +def TfeControlType : Type()">, "control">, + BuildableType<"$_builder.getType()">; // Token type. -def TfeTokenType : Type()">, "token">; +def TfeTokenType : Type()">, "token">, + BuildableType<"$_builder.getType()">; // TODO(hinsu): Define and use TensorType instead of AnyType for data operands // and results. For example, MergeOp output type. @@ -148,7 +150,11 @@ def TfExecutor_FetchOp : TfExecutor_Op<"fetch", }]> ]; + let assemblyFormat = "($fetches^ `:` type($fetches))? 
attr-dict"; + let verifier = ?; + let printer = ?; + let parser = ?; } def TfExecutor_IslandOp : TfExecutor_Op<"island", @@ -229,7 +235,11 @@ def TfExecutor_YieldOp : TfExecutor_Op<"yield", }]> ]; + let assemblyFormat = "($fetches^ `:` type($fetches))? attr-dict"; + let verifier = ?; + let printer = ?; + let parser = ?; } def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", @@ -466,6 +476,10 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", } }]; + let assemblyFormat = "`:` type($output) attr-dict"; + + let printer = ?; + let parser = ?; } @@ -527,6 +541,11 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink", result.attributes.append(attributes.begin(), attributes.end()); }]> ]; + + let assemblyFormat = " `[` $token `]` $input (`,` $controlInputs^)? `:` type($input) attr-dict"; + + let printer = ?; + let parser = ?; } def TfExecutor_ExitOp : TfExecutor_Op<"Exit", @@ -552,7 +571,7 @@ def TfExecutor_ExitOp : TfExecutor_Op<"Exit", .Attr("T: type") For example: - %1:2 = tf_executor.Exit %0#0 {T: "tfdtype$DT_INT32"} : tensor<*xi32> + %1:2 = tf_executor.Exit %0#0 : tensor<*xi32> {T: "tfdtype$DT_INT32"} Note: Additional result corresponds to the control output. }]; @@ -607,6 +626,11 @@ def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", result.attributes.append(attributes.begin(), attributes.end()); }]> ]; + + let assemblyFormat = "$controlInputs attr-dict"; + + let printer = ?; + let parser = ?; } def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 081903d13cf..ba9ba8ea248 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -52,6 +52,12 @@ an output element, this operation computes \\(y = |x|\\). def TF_AcosOp : TF_Op<"Acos", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes acos of x element-wise."; + let description = [{ +Provided an input tensor, the `tf.math.acos` operation returns the inverse cosine of each element of the tensor. If `y = tf.math.cos(x)` then, `x = tf.math.acos(y)`. + + Input range is `[-1, 1]` and the output has a range of `[0, pi]`. + }]; + let arguments = (ins TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); @@ -87,29 +93,6 @@ tf.math.acosh(x) ==> [nan nan 0. 0.62236255 5.9914584 9.903487 inf] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x + y element-wise."; - - let description = [{ -*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TF_NumberOrStrTensor:$x, - TF_NumberOrStrTensor:$y - ); - - let results = (outs - TF_NumberOrStrTensor:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; -} - def TF_AddNOp : TF_Op<"AddN", [Commutative, NoSideEffect]> { let summary = "Add all input tensors element wise."; @@ -136,31 +119,6 @@ Inputs must be of same size and shape. 
let hasFolder = 1; } -def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x + y element-wise."; - - let description = [{ -*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$y - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; - - let hasFolder = 1; -} - def TF_AdjustContrastv2Op : TF_Op<"AdjustContrastv2", [NoSideEffect]> { let summary = "Adjust the contrast of one or more images."; @@ -571,7 +529,7 @@ see the incremented value or a subsequent newer one. }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -589,7 +547,7 @@ see the decremented value or a subsequent newer one. }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -607,7 +565,7 @@ this value or a subsequent newer value of the variable. }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -859,15 +817,15 @@ about broadcasting }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$y, DefaultValuedAttr:$adj_x, DefaultValuedAttr:$adj_y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -965,7 +923,41 @@ reverse of SpaceToBatch. See below for a precise description. TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; } -def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect]> { +def TF_BetaincOp : TF_Op<"Betainc", [NoSideEffect]> { + let summary = [{ +Compute the regularized incomplete beta integral \\(I_x(a, b)\\). + }]; + + let description = [{ +The regularized incomplete beta integral is defined as: + + +\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\) + +where + + +\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\) + + +is the incomplete beta function and \\(B(a, b)\\) is the *complete* +beta function. + }]; + + let arguments = (ins + TF_F32OrF64Tensor:$a, + TF_F32OrF64Tensor:$b, + TF_F32OrF64Tensor:$x + ); + + let results = (outs + TF_F32OrF64Tensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect, TF_ContractionFusableInterface]> { let summary = "Adds `bias` to `value`."; let description = [{ @@ -986,6 +978,11 @@ Broadcasting is supported, so `value` may have any number of dimensions. 
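As a quick illustration of the broadcasting rule stated in the BiasAdd description above, here is a hedged Python sketch (TensorFlow 2.x assumed; the shapes are arbitrary examples):

```python
import tensorflow as tf

# `bias` is 1-D and matches the last (channel) dimension of `value`;
# it is broadcast across all leading dimensions.
value = tf.zeros([2, 3, 4])               # e.g. batch of 2, spatial 3, channels 4
bias = tf.constant([1.0, 2.0, 3.0, 4.0])
out = tf.nn.bias_add(value, bias)         # shape [2, 3, 4]
print(out[0, 0].numpy())                  # [1. 2. 3. 4.]
```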
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; + let verifier = [{ return Verify(*this); }]; @@ -1319,6 +1316,7 @@ subsequent operation and then be optimized away, however.) let verifier = [{ return Verify(*this); }]; + let hasFolder = 1; } def TF_BucketizeOp : TF_Op<"Bucketize", [NoSideEffect, SameOperandsAndResultShape]> { @@ -1350,48 +1348,6 @@ then the output will be TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CaseOp : TF_Op<"Case", []> { - let summary = [{ -An n-way switch statement which calls a single branch function. - }]; - - let description = [{ -An n-way switch statement, implementing the following: - ``` - switch (branch_index) { - case 0: - output = branches[0](input); - break; - case 1: - output = branches[1](input); - break; - ... - case [[nbranches-1]]: - default: - output = branches[nbranches-1](input); - break; - } - ``` - }]; - - let arguments = (ins - I32Tensor:$branch_index, - Variadic:$input, - - Confined]>:$branches, - DefaultValuedAttr:$output_shapes - ); - - let results = (outs - Variadic:$output - ); - - TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; - TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; - - let hasCanonicalizer = 1; -} - def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Cast x of type SrcT to y of DstT."; @@ -1446,6 +1402,38 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CholeskyOp : TF_Op<"Cholesky", [NoSideEffect]> { + let summary = [{ +Computes the Cholesky decomposition of one or more square matrices. + }]; + + let description = [{ +The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. + +The input has to be symmetric and positive definite. Only the lower-triangular +part of the input will be used for this operation. The upper-triangular part +will not be read. + +The output is a tensor of the same shape as the input +containing the Cholesky decompositions for all input submatrices `[..., :, :]`. + +**Note**: The gradient computation on GPU is faster for large matrices but +not for large batch dimensions when the submatrices are small. In this +case it might be faster to use the CPU. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Clips tensor values to a specified min and max."; @@ -1715,6 +1703,24 @@ def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { let hasCanonicalizer = 1; } +def TF_ConfigureDistributedTPUOp : TF_Op<"ConfigureDistributedTPU", []> { + let summary = [{ +Sets up the centralized structures for a distributed TPU system. 
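The Cholesky op registered above requires a symmetric positive-definite input and reads only its lower-triangular part. A small Python sketch of that contract, assuming TensorFlow 2.x (the matrix is an arbitrary example):

```python
import tensorflow as tf

# Symmetric positive-definite input; only the lower triangle is used.
a = tf.constant([[4.0, 2.0],
                 [2.0, 3.0]])
l = tf.linalg.cholesky(a)                         # lower-triangular L with A = L @ L^T
print(l.numpy())                                  # [[2. 0.], [1. ~1.4142]]

# Sanity check: reconstruct the input from the factor.
print(tf.matmul(l, l, transpose_b=True).numpy())  # ~[[4. 2.], [2. 3.]]
```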
+ }]; + + let arguments = (ins + StrAttr:$embedding_config, + StrAttr:$tpu_embedding_config, + DefaultValuedAttr:$is_global_init, + DefaultValuedAttr:$enable_whole_mesh_compilations, + DefaultValuedAttr:$compilation_failure_closes_chips + ); + + let results = (outs + TF_StrTensor:$topology + ); +} + def TF_ConjOp : TF_Op<"Conj", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns the complex conjugate of a complex number."; @@ -2067,17 +2073,73 @@ and `B, D, F, H` as group 1. Thus we get the outputs: }]; let arguments = (ins - TensorOf<[BF16, F32, I32, TF_Uint32]>:$input, + TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$input, I32Tensor:$group_assignment ); let results = (outs - TensorOf<[BF16, F32, I32, TF_Uint32]>:$output + TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CumprodOp : TF_Op<"Cumprod", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { + let summary = [{ +Compute the cumulative product of the tensor `x` along `axis`. + }]; + + let description = [{ +By default, this op performs an inclusive cumprod, which means that the first +element of the input is identical to the first element of the output: + +```python +tf.cumprod([a, b, c]) # => [a, a * b, a * b * c] +``` + +By setting the `exclusive` kwarg to `True`, an exclusive cumprod is +performed instead: + +```python +tf.cumprod([a, b, c], exclusive=True) # => [1, a, a * b] +``` + +By setting the `reverse` kwarg to `True`, the cumprod is performed in the +opposite direction: + +```python +tf.cumprod([a, b, c], reverse=True) # => [a * b * c, b * c, c] +``` + +This is more efficient than using separate `tf.reverse` ops. + +The `reverse` and `exclusive` kwargs can also be combined: + +```python +tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TF_I32OrI64Tensor:$axis, + + DefaultValuedAttr:$exclusive, + DefaultValuedAttr:$reverse + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; @@ -2126,6 +2188,10 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_DataFormatDimMapOp : TF_Op<"DataFormatDimMap", [NoSideEffect, SameOperandsAndResultType]> { @@ -2151,6 +2217,82 @@ the source data format. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Permute input tensor from `src_format` to `dst_format`."; + + let description = [{ +Input tensor must be a vector of size 4, or a 4x2 tensor. 
+ +For example, with `src_format` of `NHWC`, `dst_format` of `NCHW`, and inputs: +``` +[1, 2, 3, 4] +``` +and +``` +[[1, 2, 3, 4], + [5, 6, 7, 8]] +``` +, the outputs will be (respectively): +``` +[1, 4, 2, 3] +``` +and +``` +[[1, 4, 2, 3], + [5, 8, 6, 7]] +``` + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$x, + + DefaultValuedAttr:$src_format, + DefaultValuedAttr:$dst_format + ); + + let results = (outs + TF_I32OrI64Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let verifier = [{ return Verify(*this); }]; +} + +def TF_DebugIdentityV2Op : TF_Op<"DebugIdentityV2", []> { + let summary = "Debug Identity V2 Op."; + + let description = [{ +Provides an identity mapping from input to output, while writing the content of +the input tensor by calling DebugEventsWriter. + +The semantics of the input tensor depends on tensor_debug_mode. In typical +usage, the input tensor comes directly from the user computation only when +graph_debug_mode is FULL_TENSOR (see protobuf/debug_event.proto for a +list of all the possible values of graph_debug_mode). For the other debug modes, +the input tensor should be produced by an additional op or subgraph that +computes summary information about one or more tensors. + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$tfdbg_context_id, + StrAttr:$op_name, + DefaultValuedAttr:$output_slot, + DefaultValuedAttr:$tensor_debug_mode, + DefaultValuedAttr:$debug_urls, + DefaultValuedAttr:$circular_buffer_size, + StrAttr:$tfdbg_run_id + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_DecodeAndCropJpegOp : TF_Op<"DecodeAndCropJpeg", [NoSideEffect]> { let summary = "Decode and Crop a JPEG-encoded image to a uint8 tensor."; @@ -2444,6 +2586,54 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_DepthwiseConv2dNativeBackpropFilterOp : TF_Op<"DepthwiseConv2dNativeBackpropFilter", [NoSideEffect]> { + let summary = [{ +Computes the gradients of depthwise convolution with respect to the filter. + }]; + + let arguments = (ins + TF_FpTensor:$input, + I32Tensor:$filter_sizes, + TF_FpTensor:$out_backprop, + + I64ArrayAttr:$strides, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$dilations + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_DepthwiseConv2dNativeBackpropInputOp : TF_Op<"DepthwiseConv2dNativeBackpropInput", [NoSideEffect]> { + let summary = [{ +Computes the gradients of depthwise convolution with respect to the input. + }]; + + let arguments = (ins + I32Tensor:$input_sizes, + TF_FpTensor:$filter, + TF_FpTensor:$out_backprop, + + I64ArrayAttr:$strides, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$dilations + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_DeviceIndexOp : TF_Op<"DeviceIndex", [NoSideEffect]> { let summary = "Return the index of device the op runs."; @@ -2463,6 +2653,40 @@ this op runs. 
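The example in the `DataFormatVecPermute` description above can be reproduced by reading the permutation straight off the two format strings. A small Python sketch of that mapping for the size-4 vector case (`permute_vec` is an illustrative name; both formats are assumed to be permutations of the same four characters):

```python
def permute_vec(x, src_format="NHWC", dst_format="NCHW"):
    # Entry i of the output carries the value that dimension dst_format[i]
    # held in the src_format ordering.
    return [x[src_format.index(c)] for c in dst_format]

print(permute_vec([1, 2, 3, 4]))  # [1, 4, 2, 3], as in the example above
```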
The length of the list is returned in two cases: ); } +def TF_DiagOp : TF_Op<"Diag", [NoSideEffect, SameOperandsAndResultElementType]> { + let summary = "Returns a diagonal tensor with a given diagonal values."; + + let description = [{ +Given a `diagonal`, this operation returns a tensor with the `diagonal` and +everything else padded with zeros. The diagonal is computed as follows: + +Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of +rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where: + +`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else. + +For example: + +``` +# 'diagonal' is [1, 2, 3, 4] +tf.diag(diagonal) ==> [[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$diagonal + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_DiagPartOp : TF_Op<"DiagPart", [NoSideEffect]> { let summary = "Returns the diagonal part of the tensor."; @@ -2543,27 +2767,6 @@ def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, TF_SameOpe let hasFolder = 1; } -def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns 0 if the denominator is zero."; - - let description = [{ -*NOTE*: `DivNoNan` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y - ); - - let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_DynamicStitchOp : TF_Op<"DynamicStitch", [NoSideEffect, SameVariadicOperandSize]> { let summary = [{ Interleave the values from the `data` tensors into a single tensor. @@ -3117,6 +3320,27 @@ i.e. `exp(x) - 1` or `e^(x) - 1`, where `x` is the input tensor. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ExtractImagePatchesOp : TF_Op<"ExtractImagePatches", [NoSideEffect]> { + let summary = [{ +Extract `patches` from `images` and put them in the "depth" output dimension. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$images, + + Confined]>:$ksizes, + Confined]>:$strides, + Confined]>:$rates, + TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$patches + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_FFTOp : TF_Op<"FFT", [NoSideEffect]> { let summary = "Fast Fourier transform."; @@ -3589,6 +3813,95 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; } +def TF_FusedBatchNormV2Op : TF_Op<"FusedBatchNormV2", [NoSideEffect, TF_FoldOperandsTransposeInterface, TF_LayoutSensitiveInterface]> { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. 
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[BF16, F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + + // TF_LayoutSensitiveInterface: + StringRef GetOptimalLayout(const RuntimeDevices& devices); + LogicalResult UpdateDataFormat(StringRef data_format); + }]; +} + +def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect, TF_FoldOperandsTransposeInterface, TF_LayoutSensitiveInterface]> { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[BF16, F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + + // TF_LayoutSensitiveInterface: + StringRef GetOptimalLayout(const RuntimeDevices& devices); + LogicalResult UpdateDataFormat(StringRef data_format); + }]; +} + def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> { let summary = "Gather slices from `params` according to `indices`."; @@ -3922,7 +4235,7 @@ table will be immutable. ); let results = (outs - TF_ResourceTensor:$table_handle + Res:$table_handle ); } @@ -4227,33 +4540,37 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_InitializeTableFromTextFileV2Op : TF_Op<"InitializeTableFromTextFileV2", []> { - let summary = "Initializes a table from a text file."; - - let description = [{ -It inserts one key-value pair into the table for each line of the file. -The key and value is extracted from the whole line content, elements from the -split line based on `delimiter` or the line number (starting from zero). -Where to extract the key and value from a line is specified by `key_index` and -`value_index`. - -- A value of -1 means use the line number(starting from zero), expects `int64`. -- A value of -2 means use the whole line content, expects `string`. -- A value >= 0 means use the index (starting at zero) of the split line based - on `delimiter`. 
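As a rough illustration of the `FusedBatchNormV2`/`FusedBatchNormV3` arithmetic in the inference case (`is_training` false), the per-channel normalization can be sketched in NumPy. The function name, the NHWC layout, and the 1e-4 epsilon are assumptions made for the sketch; the batch statistics and reserve-space outputs are omitted:

```python
import numpy as np

def fused_batch_norm_inference(x, scale, offset, mean, variance, epsilon=1e-4):
    # x: [N, H, W, C] in NHWC; scale/offset/mean/variance: [C], matching the
    # 1-D operands of the op. Only the normalized output y is sketched.
    inv = scale / np.sqrt(variance + epsilon)
    return x * inv + (offset - mean * inv)

x = np.random.rand(2, 4, 4, 3).astype(np.float32)
y = fused_batch_norm_inference(x,
                               scale=np.ones(3, np.float32),
                               offset=np.zeros(3, np.float32),
                               mean=x.mean(axis=(0, 1, 2)),
                               variance=x.var(axis=(0, 1, 2)))
```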
+def TF_InfeedDequeueOp : TF_Op<"InfeedDequeue", []> { + let summary = [{ +A placeholder op for a value that will be fed into the computation. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, - TF_StrTensor:$filename, + TF_ShapeAttr:$shape + ); - Confined]>:$key_index, - Confined]>:$value_index, - Confined, [IntMinValue<-1>]>:$vocab_size, - DefaultValuedAttr:$delimiter + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_InitializeTableV2Op : TF_Op<"InitializeTableV2", []> { + let summary = [{ +Table initializer that takes two tensors for keys and values respectively. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_Tensor:$keys, + TF_Tensor:$values ); let results = (outs); + + TF_DerivedOperandTypeAttr Tval = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr Tkey = TF_DerivedOperandTypeAttr<1>; } def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { @@ -4563,7 +4880,7 @@ def TF_LRNGradOp : TF_Op<"LRNGrad", [NoSideEffect]> { TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType]> { +def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType, TF_ContractionFusableInterface]> { let summary = "Computes rectified linear: `max(features, features * alpha)`."; let arguments = (ins @@ -4579,6 +4896,11 @@ def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasFolder = 1; + + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; } def TF_LeakyReluGradOp : TF_Op<"LeakyReluGrad", [NoSideEffect, SameOperandsAndResultType]> { @@ -4772,6 +5094,49 @@ tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0 11.0 12.0] TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<2>; } +def TF_ListDiffOp : TF_Op<"ListDiff", [NoSideEffect]> { + let summary = [{ +Computes the difference between two lists of numbers or strings. + }]; + + let description = [{ +Given a list `x` and a list `y`, this operation returns a list `out` that +represents all values that are in `x` but not in `y`. The returned list `out` +is sorted in the same order that the numbers appear in `x` (duplicates are +preserved). This operation also returns a list `idx` that represents the +position of each `out` element in `x`. 
In other words: + +`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]` + +For example, given this input: + +``` +x = [1, 2, 3, 4, 5, 6] +y = [1, 3, 5] +``` + +This operation would return: + +``` +out ==> [2, 4, 6] +idx ==> [1, 3, 5] +``` + }]; + + let arguments = (ins + TF_Tensor:$x, + TF_Tensor:$y + ); + + let results = (outs + TF_Tensor:$out, + TF_I32OrI64Tensor:$idx + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_idx = TF_DerivedResultTypeAttr<1>; +} + def TF_LogOp : TF_Op<"Log", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes natural logarithm of x element-wise."; @@ -4896,6 +5261,22 @@ def TF_LogicalOrOp : TF_Op<"LogicalOr", [Commutative, NoSideEffect, ResultsBroad ); } +def TF_LookupTableExportV2Op : TF_Op<"LookupTableExportV2", []> { + let summary = "Outputs all keys and values in the table."; + + let arguments = (ins + Arg:$table_handle + ); + + let results = (outs + TF_Tensor:$keys, + TF_Tensor:$values + ); + + TF_DerivedResultTypeAttr Tkeys = TF_DerivedResultTypeAttr<0>; + TF_DerivedResultTypeAttr Tvalues = TF_DerivedResultTypeAttr<1>; +} + def TF_LookupTableFindV2Op : TF_Op<"LookupTableFindV2", []> { let summary = "Looks up keys in a table, outputs the corresponding values."; @@ -4908,7 +5289,7 @@ table. It must also be of the same type as the table values. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$default_value ); @@ -4932,7 +5313,7 @@ The tensor `values` must be of the type of the table values. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$values ); @@ -4943,6 +5324,44 @@ The tensor `values` must be of the type of the table values. TF_DerivedOperandTypeAttr Tout = TF_DerivedOperandTypeAttr<2>; } +def TF_LookupTableInsertV2Op : TF_Op<"LookupTableInsertV2", []> { + let summary = "Updates the table to associates keys with values."; + + let description = [{ +The tensor `keys` must be of the same type as the keys of the table. +The tensor `values` must be of the type of the table values. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_Tensor:$keys, + TF_Tensor:$values + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tin = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr Tout = TF_DerivedOperandTypeAttr<2>; +} + +def TF_LookupTableRemoveV2Op : TF_Op<"LookupTableRemoveV2", []> { + let summary = "Removes keys and its associated values from a table."; + + let description = [{ +The tensor `keys` must of the same type as the keys of the table. Keys not +already in the table are silently ignored. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_Tensor:$keys + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tin = TF_DerivedOperandTypeAttr<1>; +} + def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { let summary = "Computes the number of elements in the given table."; @@ -4955,6 +5374,44 @@ def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { ); } +def TF_LowerBoundOp : TF_Op<"LowerBound", [NoSideEffect]> { + let summary = [{ +Applies lower_bound(sorted_search_values, values) along each row. + }]; + + let description = [{ +Each set of rows with the same index in (sorted_inputs, values) is treated +independently. The resulting row is the equivalent of calling +`np.searchsorted(sorted_inputs, values, side='left')`. + +The result is not a global index to the entire +`Tensor`, but rather just the index in the last dimension. 
+ +A 2-D example: + sorted_sequence = [[0, 3, 9, 9, 10], + [1, 2, 3, 4, 5]] + values = [[2, 4, 9], + [0, 2, 6]] + + result = LowerBound(sorted_sequence, values) + + result == [[1, 2, 2], + [0, 1, 5]] + }]; + + let arguments = (ins + TF_Tensor:$sorted_inputs, + TF_Tensor:$values + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; +} + def TF_MatMulOp : TF_Op<"MatMul", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = [{ Multiply the matrix "a" by the matrix "b". @@ -5464,6 +5921,36 @@ tf.matrix_diag(diagonal, k = -1, num_rows = 3, padding_value = 9) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixInverseOp : TF_Op<"MatrixInverse", [NoSideEffect]> { + let summary = [{ +Computes the inverse of one or more square invertible matrices or their adjoints (conjugate transposes). + }]; + + let description = [{ +The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. The output is a tensor of the same shape as the input +containing the inverse for all input submatrices `[..., :, :]`. + +The op uses LU decomposition with partial pivoting to compute the inverses. + +If a matrix is not invertible there is no guarantee what the op does. It +may detect the condition and raise an exception or it may simply return a +garbage result. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MatrixSetDiagOp : TF_Op<"MatrixSetDiag", [NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. @@ -5715,6 +6202,100 @@ tf.matrix_set_diag(input, diagonals, k = (-1, 2), align="LEFT_RIGHT") TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixSolveOp : TF_Op<"MatrixSolve", [NoSideEffect]> { + let summary = "Solves systems of linear equations."; + + let description = [{ +`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is +a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output matrix +satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +If `adjoint` is `True` then each output matrix satisfies +`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MatrixTriangularSolveOp : TF_Op<"MatrixTriangularSolve", [NoSideEffect]> { + let summary = [{ +Solves systems of linear equations with upper or lower triangular matrices by backsubstitution. + }]; + + let description = [{ +`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form +square matrices. If `lower` is `True` then the strictly upper triangular part +of each inner-most matrix is assumed to be zero and not accessed. 
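Because the `LowerBound` description above defines each row as `np.searchsorted(sorted_inputs, values, side='left')`, its 2-D example can be checked directly. The row-wise loop below only illustrates the semantics; it is not how the kernel is implemented:

```python
import numpy as np

sorted_sequence = np.array([[0, 3, 9, 9, 10],
                            [1, 2, 3, 4, 5]])
values = np.array([[2, 4, 9],
                   [0, 2, 6]])

# Each row is handled independently, exactly as in the op description.
result = np.stack([np.searchsorted(s, v, side="left")
                   for s, v in zip(sorted_sequence, values)])
print(result)  # [[1 2 2]
               #  [0 1 5]]
```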
+If `lower` is False then the strictly lower triangular part of each inner-most +matrix is assumed to be zero and not accessed. +`rhs` is a tensor of shape `[..., M, N]`. + +The output is a tensor of shape `[..., M, N]`. If `adjoint` is +`True` then the innermost matrices in `output` satisfy matrix equations +`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +If `adjoint` is `False` then the strictly then the innermost matrices in +`output` satisfy matrix equations +`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`. + +Note, the batch shapes for the inputs only need to broadcast. + +Example: +```python + +a = tf.constant([[3, 0, 0, 0], + [2, 1, 0, 0], + [1, 0, 1, 0], + [1, 1, 1, 1]], dtype=tf.float32) + +b = tf.constant([[4], + [2], + [4], + [2]], dtype=tf.float32) + +x = tf.linalg.triangular_solve(a, b, lower=True) +x +# + +# in python3 one can use `a@x` +tf.matmul(a, x) +# +``` + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + + DefaultValuedAttr:$lower, + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MaxOp : TF_Op<"Max", [NoSideEffect]> { let summary = [{ Computes the maximum of elements across dimensions of a tensor. @@ -5728,14 +6309,14 @@ retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5755,7 +6336,8 @@ def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect, TF_FoldOperandsTransposeInter Confined]>:$ksize, Confined]>:$strides, - TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, DefaultValuedAttr, "NHWC">:$data_format ); @@ -5824,7 +6406,8 @@ def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { Confined]>:$ksize, Confined]>:$strides, - TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, DefaultValuedAttr:$data_format ); @@ -5839,25 +6422,60 @@ def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { }]; } -def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns the max of x and y (i.e. x > y ? x : y) element-wise."; +def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { + let summary = "Computes the mean of elements across dimensions of a tensor."; let description = [{ -*NOTE*: `Maximum` supports broadcasting. 
More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TF_I32OrI64Tensor:$reduction_indices, + + DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$z + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + }]; +} + +def TF_MergeSummaryOp : TF_Op<"MergeSummary", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Merges summaries."; + + let description = [{ +This op creates a +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +protocol buffer that contains the union of all the values in the input +summaries. + +When the Op is run, it reports an `InvalidArgument` error if multiple values +in the summaries to merge use the same tag. + }]; + + let arguments = (ins + Variadic:$inputs + ); + + let results = (outs + TF_StrTensor:$summary + ); + + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } def TF_MergeV2CheckpointsOp : TF_Op<"MergeV2Checkpoints", []> { @@ -5899,14 +6517,14 @@ retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5981,7 +6599,7 @@ pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2] } def TF_MlirLocalVarOp : TF_Op<"MlirLocalVarOp", []> { - let summary = "Creates a handle to a in-scope variable."; + let summary = "Creates a handle to an in-scope variable."; let description = [{ Used by internal passes for temporary representation of local state, which will @@ -5991,7 +6609,7 @@ be eventually removed. let arguments = (ins); let results = (outs - TF_ResourceTensor:$resource + Res:$resource ); } @@ -6072,7 +6690,7 @@ the result here is consistent with a truncating divide. E.g. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, +def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x * y element-wise."; @@ -6137,6 +6755,85 @@ def TF_MultinomialOp : TF_Op<"Multinomial", [TF_CannotDuplicate]> { TF_DerivedResultTypeAttr output_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_MutableDenseHashTableV2Op : TF_Op<"MutableDenseHashTableV2", []> { + let summary = [{ +Creates an empty hash table that uses tensors as the backing store. + }]; + + let description = [{ +It uses "open addressing" with quadratic reprobing to resolve +collisions. + +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a scalar. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. + }]; + + let arguments = (ins + TF_Tensor:$empty_key, + TF_Tensor:$deleted_key, + + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$value_dtype, + DefaultValuedAttr({})">:$value_shape, + DefaultValuedAttr:$initial_num_buckets, + DefaultValuedAttr:$max_load_factor + ); + + let results = (outs + Res:$table_handle + ); + + TF_DerivedOperandTypeAttr key_dtype = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MutableHashTableOfTensorsV2Op : TF_Op<"MutableHashTableOfTensorsV2", []> { + let summary = "Creates an empty hash table."; + + let description = [{ +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a vector. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. + }]; + + let arguments = (ins + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$key_dtype, + TypeAttr:$value_dtype, + DefaultValuedAttr({})">:$value_shape + ); + + let results = (outs + Res:$table_handle + ); +} + +def TF_MutableHashTableV2Op : TF_Op<"MutableHashTableV2", []> { + let summary = "Creates an empty hash table."; + + let description = [{ +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a scalar. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. + }]; + + let arguments = (ins + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$key_dtype, + TypeAttr:$value_dtype + ); + + let results = (outs + Res:$table_handle + ); +} + def TF_NdtriOp : TF_Op<"Ndtri", [NoSideEffect]> { let summary = ""; @@ -7233,9 +7930,6 @@ def TF_RangeDatasetOp : TF_Op<"RangeDataset", []> { Creates a dataset with a range of values. Corresponds to python's xrange. }]; - let description = [{ - }]; - let arguments = (ins I64Tensor:$start, I64Tensor:$stop, @@ -7340,33 +8034,6 @@ tf.real(input) ==> [-2.25, 3.25] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x / y element-wise for real types."; - - let description = [{ -If `x` and `y` are reals, this will return the floating-point division. - -*NOTE*: `Div` supports broadcasting. 
More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; - - let hasFolder = 1; -} - def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; @@ -7430,7 +8097,7 @@ most one RecvTPUEmbeddingActivations op in the TPU graph. TF_DerivedResultSizeAttr num_outputs = TF_DerivedResultSizeAttr<0>; } -def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> { +def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_ContractionFusableInterface, TF_LayoutAgnostic]> { let summary = "Computes rectified linear: `max(features, 0)`."; let description = [{ @@ -7449,6 +8116,11 @@ array([ 0., 0., -0., 3.], dtype=float32) ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; } def TF_Relu6Op : TF_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { @@ -7645,6 +8317,105 @@ Resize `images` to `size` using nearest neighbor interpolation. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResourceApplyAdaMaxOp : TF_Op<"ResourceApplyAdaMax", []> { + let summary = "Update '*var' according to the AdaMax algorithm."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +v_t <- max(beta2 * v_{t-1}, abs(g)) +variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon) + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + Arg:$v, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1_power, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyAdadeltaOp : TF_Op<"ResourceApplyAdadelta", []> { + let summary = "Update '*var' according to the adadelta scheme."; + + let description = [{ +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var 
-= update; + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$accum_update, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyAdagradOp : TF_Op<"ResourceApplyAdagrad", []> { + let summary = "Update '*var' according to the adagrad scheme."; + + let description = [{ +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$update_slots + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyAdagradDAOp : TF_Op<"ResourceApplyAdagradDA", []> { + let summary = "Update '*var' according to the proximal adagrad scheme."; + + let arguments = (ins + Arg:$var, + Arg:$gradient_accumulator, + Arg:$gradient_squared_accumulator, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + I64Tensor:$global_step, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceApplyAdagradV2Op : TF_Op<"ResourceApplyAdagradV2", []> { let summary = "Update '*var' according to the adagrad scheme."; @@ -7654,8 +8425,8 @@ var -= lr * grad * (1 / (sqrt(accum) + epsilon)) }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, + Arg:$var, + Arg:$accum, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, @@ -7680,9 +8451,9 @@ $$\text{variable} 
:= \text{variable} - \text{lr}_t * m_t / (\sqrt{v_t} + \epsilo }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$m, - TF_ResourceTensor:$v, + Arg:$var, + Arg:$m, + Arg:$v, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1_power, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2_power, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, @@ -7700,6 +8471,32 @@ $$\text{variable} := \text{variable} - \text{lr}_t * m_t / (\sqrt{v_t} + \epsilo TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; } +def TF_ResourceApplyAddSignOp : TF_Op<"ResourceApplyAddSign", []> { + let summary = "Update '*var' according to the AddSign update."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +update <- (alpha + sign_decay * sign(g) *sign(m)) * g +variable <- variable - lr_t * update + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$sign_decay, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + def TF_ResourceApplyCenteredRMSPropOp : TF_Op<"ResourceApplyCenteredRMSProp", []> { let summary = "Update '*var' according to the centered RMSProp algorithm."; @@ -7725,10 +8522,10 @@ var <- var - mom }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$mg, - TF_ResourceTensor:$ms, - TF_ResourceTensor:$mom, + Arg:$var, + Arg:$mg, + Arg:$ms, + Arg:$mom, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, @@ -7743,11 +8540,74 @@ var <- var - mom TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<4>; } +def TF_ResourceApplyFtrlOp : TF_Op<"ResourceApplyFtrl", []> { + let summary = "Update '*var' according to the Ftrl-proximal scheme."; + + let description = [{ +accum_new = accum + grad * grad +linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + }]; + + 
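The `ResourceApplyFtrl` pseudocode above transcribes directly into NumPy. The sketch below follows those equations verbatim, ignores `use_locking` and `multiply_linear_by_lr`, and uses illustrative initial values only:

```python
import numpy as np

def ftrl_update(var, accum, linear, grad, lr, l1, l2, lr_power):
    """Direct transcription of the ResourceApplyFtrl update shown above."""
    accum_new = accum + grad * grad
    linear = linear + grad - (accum_new ** -lr_power - accum ** -lr_power) / lr * var
    quadratic = 1.0 / (accum_new ** lr_power * lr) + 2 * l2
    var = np.where(np.abs(linear) > l1,
                   (np.sign(linear) * l1 - linear) / quadratic,
                   0.0)
    return var, accum_new, linear

var, accum, linear = np.zeros(3), np.full(3, 0.1), np.zeros(3)
var, accum, linear = ftrl_update(var, accum, linear,
                                 grad=np.array([0.5, -0.2, 0.0]),
                                 lr=0.05, l1=0.001, l2=0.001, lr_power=-0.5)
```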
let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$linear, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr_power, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$multiply_linear_by_lr + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyFtrlV2Op : TF_Op<"ResourceApplyFtrlV2", []> { + let summary = "Update '*var' according to the Ftrl-proximal scheme."; + + let description = [{ +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$linear, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2_shrinkage, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr_power, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$multiply_linear_by_lr + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceApplyGradientDescentOp : TF_Op<"ResourceApplyGradientDescent", []> { let summary = "Update '*var' by subtracting 'alpha' * 'delta' from it."; let arguments = (ins - TF_ResourceTensor:$var, + Arg:$var, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$delta, @@ -7770,8 +8630,8 @@ var += accum }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, + Arg:$var, + Arg:$accum, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, 
TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, @@ -7796,8 +8656,8 @@ var -= lr * accum }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, + Arg:$var, + Arg:$accum, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, @@ -7811,6 +8671,116 @@ var -= lr * accum TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; } +def TF_ResourceApplyPowerSignOp : TF_Op<"ResourceApplyPowerSign", []> { + let summary = "Update '*var' according to the AddSign update."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g +variable <- variable - lr_t * update + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$logbase, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$sign_decay, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyProximalAdagradOp : TF_Op<"ResourceApplyProximalAdagrad", []> { + let summary = [{ +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. 
+ }]; + + let description = [{ +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyProximalGradientDescentOp : TF_Op<"ResourceApplyProximalGradientDescent", []> { + let summary = "Update '*var' as FOBOS algorithm with fixed learning rate."; + + let description = [{ +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + }]; + + let arguments = (ins + Arg:$var, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$delta, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + +def TF_ResourceApplyRMSPropOp : TF_Op<"ResourceApplyRMSProp", []> { + let summary = "Update '*var' according to the RMSProp algorithm."; + + let description = [{ +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. 
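The `ResourceApplyProximalGradientDescent` update above is compact enough to restate in NumPy. The sketch below is a direct transcription of the `prox_v` step and the soft-threshold step; all names and values are chosen only for illustration:

```python
import numpy as np

def proximal_gradient_descent(var, alpha, l1, l2, delta):
    """NumPy transcription of the ResourceApplyProximalGradientDescent update."""
    prox_v = var - alpha * delta
    return (np.sign(prox_v) / (1.0 + alpha * l2)
            * np.maximum(np.abs(prox_v) - alpha * l1, 0.0))

var = np.array([0.5, -0.3, 0.01])
var = proximal_gradient_descent(var, alpha=0.1, l1=0.05, l2=0.01,
                                delta=np.array([1.0, -1.0, 0.0]))
```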
+ +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + }]; + + let arguments = (ins + Arg:$var, + Arg:$ms, + Arg:$mom, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceGatherOp : TF_Op<"ResourceGather", []> { let summary = [{ Gather slices from the variable pointed to by `resource` according to `indices`. @@ -7833,7 +8803,7 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where: }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_I32OrI64Tensor:$indices, DefaultValuedAttr:$batch_dims, @@ -7848,6 +8818,405 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where: TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_ResourceScatterAddOp : TF_Op<"ResourceScatterAdd", []> { + let summary = "Adds sparse updates to the variable referenced by `resource`."; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] += updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] += updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] += updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions add. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
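The duplicate-index behaviour called out for `ResourceScatterAdd` (contributions at repeated indices add rather than overwrite) matches NumPy's unbuffered `np.add.at`. A small sketch with illustrative data:

```python
import numpy as np

ref = np.zeros(4)
indices = np.array([1, 3, 1])           # index 1 appears twice
updates = np.array([10.0, 20.0, 5.0])

# Unbuffered in-place add: equivalent to ref[indices] += updates with
# repeated indices accumulating, as the op description requires.
np.add.at(ref, indices, updates)
print(ref)  # [ 0. 15.  0. 20.]
```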
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterDivOp : TF_Op<"ResourceScatterDiv", []> { + let summary = [{ +Divides sparse updates into the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] /= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] /= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions multiply. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMaxOp : TF_Op<"ResourceScatterMax", []> { + let summary = [{ +Reduces sparse updates into the variable referenced by `resource` using the `max` operation. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] = max(ref[indices, ...], updates[...]) + + # Vector indices (for each i) + ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...]) + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions are combined. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMinOp : TF_Op<"ResourceScatterMin", []> { + let summary = [{ +Reduces sparse updates into the variable referenced by `resource` using the `min` operation. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] = min(ref[indices, ...], updates[...]) + + # Vector indices (for each i) + ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...]) + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions are combined. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMulOp : TF_Op<"ResourceScatterMul", []> { + let summary = [{ +Multiplies sparse updates into the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] *= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] *= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions multiply. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdAddOp : TF_Op<"ResourceScatterNdAdd", []> { + let summary = [{ +Applies sparse addition to individual values or slices in a Variable. + }]; + + let description = [{ +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +``` + +For example, say we want to add 4 scattered elements to a rank-1 tensor to +8 elements. In Python, that addition would look like this: + +```python +ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +indices = tf.constant([[4], [3], [1], [7]]) +updates = tf.constant([9, 10, 11, 12]) +add = tf.scatter_nd_add(ref, indices, updates) +with tf.Session() as sess: + print sess.run(add) +``` + +The resulting update to ref would look like this: + + [1, 13, 3, 14, 14, 6, 7, 20] + +See `tf.scatter_nd` for more details about how to make updates to +slices. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdSubOp : TF_Op<"ResourceScatterNdSub", []> { + let summary = [{ +Applies sparse subtraction to individual values or slices in a Variable. + }]; + + let description = [{ +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +``` + +For example, say we want to subtract 4 scattered elements from a rank-1 tensor +with 8 elements. In Python, that subtraction would look like this: + +```python +ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +indices = tf.constant([[4], [3], [1], [7]]) +updates = tf.constant([9, 10, 11, 12]) +sub = tf.scatter_nd_sub(ref, indices, updates) +with tf.Session() as sess: + print sess.run(sub) +``` + +The resulting update to ref would look like this: + + [1, -9, 3, -6, -4, 6, 7, -4] + +See `tf.scatter_nd` for more details about how to make updates to +slices. 
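The docstring examples above are written in Python 2 with `tf.Session`; as a side note, the same updates can be expressed in TF 2.x against a `tf.Variable`, whose `scatter_nd_add`/`scatter_nd_sub` methods are expected to dispatch to these resource ops. A short sketch:

```python
import tensorflow as tf

ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
indices = tf.constant([[4], [3], [1], [7]])
updates = tf.constant([9, 10, 11, 12])

ref.scatter_nd_add(indices, updates)
print(ref.numpy())   # [ 1 13  3 14 14  6  7 20]

# Subtracting the same updates restores the original contents.
ref.scatter_nd_sub(indices, updates)
print(ref.numpy())   # [1 2 3 4 5 6 7 8]
```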
+ }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdUpdateOp : TF_Op<"ResourceScatterNdUpdate", []> { + let summary = [{ +Applies sparse `updates` to individual values or slices within a given + }]; + + let description = [{ +variable according to `indices`. + +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]. +``` + +For example, say we want to update 4 scattered elements to a rank-1 tensor to +8 elements. In Python, that update would look like this: + +```python + ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8]) + indices = tf.constant([[4], [3], [1] ,[7]]) + updates = tf.constant([9, 10, 11, 12]) + update = tf.scatter_nd_update(ref, indices, updates) + with tf.Session() as sess: + print sess.run(update) +``` + +The resulting update to ref would look like this: + + [1, 11, 3, 10, 9, 6, 7, 12] + +See `tf.scatter_nd` for more details about how to make updates to +slices. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterSubOp : TF_Op<"ResourceScatterSub", []> { + let summary = [{ +Subtracts sparse updates from the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] -= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] -= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions add. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + def TF_ResourceScatterUpdateOp : TF_Op<"ResourceScatterUpdate", []> { let summary = [{ Assigns sparse updates to the variable referenced by `resource`. @@ -7867,7 +9236,7 @@ This operation computes }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_I32OrI64Tensor:$indices, TF_Tensor:$updates ); @@ -7878,6 +9247,38 @@ This operation computes TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; } +def TF_ResourceStridedSliceAssignOp : TF_Op<"ResourceStridedSliceAssign", []> { + let summary = "Assign `value` to the sliced l-value reference of `ref`."; + + let description = [{ +The values of `value` are assigned to the positions in the variable +`ref` that are selected by the slice parameters. The slice parameters +`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`. + +NOTE this op currently does not support broadcasting and so `value`'s +shape must be exactly the shape produced by the slice of `ref`. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$begin, + TF_I32OrI64Tensor:$end, + TF_I32OrI64Tensor:$strides, + TF_Tensor:$value, + + DefaultValuedAttr:$begin_mask, + DefaultValuedAttr:$end_mask, + DefaultValuedAttr:$ellipsis_mask, + DefaultValuedAttr:$new_axis_mask, + DefaultValuedAttr:$shrink_axis_mask + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<4>; + TF_DerivedOperandTypeAttr Index = TF_DerivedOperandTypeAttr<1>; +} + def TF_RestoreV2Op : TF_Op<"RestoreV2", []> { let summary = "Restores tensors from a V2 checkpoint."; @@ -8129,6 +9530,47 @@ rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_RollOp : TF_Op<"Roll", [NoSideEffect]> { + let summary = "Rolls the elements of a tensor along an axis."; + + let description = [{ +The elements are shifted positively (towards larger indices) by the offset of +`shift` along the dimension of `axis`. Negative `shift` values will shift +elements in the opposite direction. Elements that roll passed the last position +will wrap around to the first and vice versa. Multiple shifts along multiple +axes may be specified. 
+ +For example: + +``` +# 't' is [0, 1, 2, 3, 4] +roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2] + +# shifting along multiple dimensions +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] + +# shifting along the same axis multiple times +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$shift, + TF_I32OrI64Tensor:$axis + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tshift = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Taxis = TF_DerivedOperandTypeAttr<2>; +} + def TF_RoundOp : TF_Op<"Round", [NoSideEffect, SameOperandsAndResultType]> { let summary = [{ Rounds the values of a tensor to the nearest integer, element-wise. @@ -8712,7 +10154,7 @@ This operation returns N 1-D integer tensors representing shape of `input[i]s`. return Verify(*this); }]; - let hasFolder = 1; + let hasCanonicalizer = 1; } def TF_ShardedFilenameOp : TF_Op<"ShardedFilename", [NoSideEffect]> { @@ -8735,6 +10177,18 @@ Generate a sharded filename. The filename is printf formatted as ); } +def TF_ShutdownDistributedTPUOp : TF_Op<"ShutdownDistributedTPU", []> { + let summary = "Shuts down a running distributed TPU system."; + + let description = [{ +The op returns an error if no system is running. + }]; + + let arguments = (ins); + + let results = (outs); +} + def TF_SigmoidOp : TF_Op<"Sigmoid", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes sigmoid of `x` element-wise."; @@ -8876,6 +10330,8 @@ size(t) ==> 12 let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; } def TF_SliceOp : TF_Op<"Slice", [NoSideEffect]> { @@ -9251,6 +10707,41 @@ backpropagation, TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; } +def TF_SparseMatMulOp : TF_Op<"SparseMatMul", [NoSideEffect]> { + let summary = [{ +Multiply matrix "a" by matrix "b". + }]; + + let description = [{ +The inputs must be two-dimensional matrices and the inner dimension of "a" must +match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not +`SparseTensor`s. This op is optimized for the case where at least one of "a" or +"b" is sparse, in the sense that they have a large proportion of zero values. +The breakeven for using this versus a dense matrix multiply on one platform was +30% zero values in the sparse matrix. + +The gradient computation of this operation will only take advantage of sparsity +in the input gradient when that gradient comes from a Relu. + }]; + + let arguments = (ins + TensorOf<[BF16, F32]>:$a, + TensorOf<[BF16, F32]>:$b, + + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$a_is_sparse, + DefaultValuedAttr:$b_is_sparse + ); + + let results = (outs + F32Tensor:$product + ); + + TF_DerivedOperandTypeAttr Ta = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tb = TF_DerivedOperandTypeAttr<1>; +} + def TF_SparseReshapeOp : TF_Op<"SparseReshape", [NoSideEffect]> { let summary = [{ Reshapes a SparseTensor to represent values in a new dense shape. @@ -9482,7 +10973,7 @@ I.e., \\(y = x * x = x^2\\). 
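A usage sketch for the `SparseMatMul` op defined above (illustrative values, TF 2.x eager assumed); the `tf.raw_ops.SparseMatMul` wrapper takes keyword arguments matching the `a`/`b`/`a_is_sparse` names in this definition.

```python
import tensorflow as tf

# "a" is a dense tensor that is mostly zero, the case SparseMatMul targets.
a = tf.constant([[0.0, 0.0, 1.0],
                 [0.0, 2.0, 0.0]])
b = tf.constant([[1.0, 1.0],
                 [1.0, 1.0],
                 [1.0, 1.0]])

product = tf.raw_ops.SparseMatMul(a=a, b=b, a_is_sparse=True)
print(product.numpy())  # [[1. 1.]
                        #  [2. 2.]]
```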
def TF_SquaredDifferenceOp : TF_Op<"SquaredDifference", [Commutative, NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { - let summary = "Returns (x - y)(x - y) element-wise."; + let summary = "Returns conj(x - y)(x - y) element-wise."; let description = [{ *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting @@ -9542,7 +11033,7 @@ def TF_StackCloseV2Op : TF_Op<"StackCloseV2", []> { let summary = "Delete the stack from its resource container."; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs); @@ -9552,7 +11043,7 @@ def TF_StackPopV2Op : TF_Op<"StackPopV2", []> { let summary = "Pop the element at the top of the stack."; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs @@ -9566,7 +11057,7 @@ def TF_StackPushV2Op : TF_Op<"StackPushV2", []> { let summary = "Push an element onto the stack."; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, TF_Tensor:$elem, DefaultValuedAttr:$swap_memory @@ -9590,10 +11081,53 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { ); let results = (outs - TF_ResourceTensor:$handle + Res:$handle ); } +def TF_StatelessMultinomialOp : TF_Op<"StatelessMultinomial", [NoSideEffect]> { + let summary = "Draws samples from a multinomial distribution."; + + let arguments = (ins + TF_IntOrFpTensor:$logits, + I32Tensor:$num_samples, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr output_dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatelessRandomNormalOp : TF_Op<"StatelessRandomNormal", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom values from a normal distribution. + }]; + + let description = [{ +The generated values will have mean 0 and standard deviation 1. + +The outputs are a deterministic function of `shape` and `seed`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect]> { let summary = [{ Outputs deterministic pseudorandom random values from a uniform distribution. @@ -9620,6 +11154,33 @@ The outputs are a deterministic function of `shape` and `seed`. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_StatelessRandomUniformIntOp : TF_Op<"StatelessRandomUniformInt", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom random integers from a uniform distribution. + }]; + + let description = [{ +The generated values follow a uniform distribution in the range `[minval, maxval)`. + +The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`. 
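As a usage note on the stateless RNG ops added above (a sketch, assuming the `tf.random.stateless_*` Python wrappers forward `shape`, `seed`, `minval` and `maxval` to these kernels): repeated calls with identical arguments return identical values.

```python
import tensorflow as tf

seed = tf.constant([7, 42], dtype=tf.int64)   # shape-[2] seed

a = tf.random.stateless_uniform([3], seed=seed, minval=0, maxval=10,
                                dtype=tf.int32)
b = tf.random.stateless_uniform([3], seed=seed, minval=0, maxval=10,
                                dtype=tf.int32)

# Deterministic: same shape/seed/minval/maxval => same output.
assert bool(tf.reduce_all(a == b))

n1 = tf.random.stateless_normal([2, 2], seed=seed)
n2 = tf.random.stateless_normal([2, 2], seed=seed)
assert bool(tf.reduce_all(n1 == n2))
```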
+ }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TF_I32OrI64Tensor:$minval, + TF_I32OrI64Tensor:$maxval + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + def TF_StatelessTruncatedNormalOp : TF_Op<"StatelessTruncatedNormal", [NoSideEffect]> { let summary = [{ Outputs deterministic pseudorandom values from a truncated normal distribution. @@ -9889,7 +11450,37 @@ Examples: TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } -def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, +def TF_StringToHashBucketFastOp : TF_Op<"StringToHashBucketFast", [NoSideEffect]> { + let summary = [{ +Converts each string in the input Tensor to its hash mod by a number of buckets. + }]; + + let description = [{ +The hash function is deterministic on the content of the string within the +process and will never change. However, it is not suitable for cryptography. +This function may be used when CPU time is scarce and inputs are trusted or +unimportant. There is a risk of adversaries constructing inputs that all hash +to the same bucket. To prevent this problem, use a strong hash function with +`tf.string_to_hash_bucket_strong`. + +Examples: + +>>> tf.strings.to_hash_bucket_fast(["Hello", "TensorFlow", "2.x"], 3).numpy() +array([0, 2, 2]) + }]; + + let arguments = (ins + TF_StrTensor:$input, + + Confined]>:$num_buckets + ); + + let results = (outs + I64Tensor:$output + ); +} + +def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x - y element-wise."; @@ -9944,6 +11535,25 @@ retained with length 1. >]; } +def TF_SymbolicGradientOp : TF_Op<"SymbolicGradient", [NoSideEffect]> { + let summary = [{ +Computes the gradient function for function f via backpropagation. + }]; + + let arguments = (ins + Variadic:$input, + + SymbolRefAttr:$f + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; +} + def TF_TPUCompilationResultOp : TF_Op<"TPUCompilationResult", [NoSideEffect]> { let summary = "Returns the result of a TPU compilation."; @@ -10158,9 +11768,9 @@ variables. }]; let arguments = (ins - Variadic:$vars, + Arg, "", [TF_VariableRead, TF_VariableWrite]>:$vars, TF_StrTensor:$new_format_key, - TF_ResourceTensor:$format_state_var + Arg:$format_state_var ); let results = (outs); @@ -10248,7 +11858,7 @@ of a step/run. }]; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs); @@ -10272,7 +11882,7 @@ All elements must have the same shape (excepting the first dimension). }]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, F32Tensor:$flow_in, DefaultValuedAttr:$element_shape_except0 @@ -10296,7 +11906,7 @@ All elements selected by `indices` must have the same shape. }]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, I32Tensor:$indices, F32Tensor:$flow_in, @@ -10355,14 +11965,14 @@ calculation gets its own TensorArray accumulator. 
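The `TensorArray*V3` ops being annotated in this and the following hunks back the `tf.TensorArray` Python class; a minimal usage sketch follows, with the caveat that newer TF 2.x code paths may lower `tf.TensorArray` to TensorList ops rather than these V3 kernels.

```python
import tensorflow as tf

# tf.TensorArray is the user-facing API historically implemented by the
# TensorArray*V3 kernels (write/read/stack map to WriteV3/ReadV3/GatherV3);
# TF 2.x may instead use TensorList ops under the hood.
ta = tf.TensorArray(dtype=tf.float32, size=3)
ta = ta.write(0, 10.0)   # write returns the updated TensorArray
ta = ta.write(1, 20.0)
ta = ta.write(2, 30.0)

print(ta.read(1).numpy())    # 20.0
print(ta.stack().numpy())    # [10. 20. 30.]
```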
}]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, F32Tensor:$flow_in, StrAttr:$source ); let results = (outs - TF_ResourceTensor:$grad_handle, + Res:$grad_handle, F32Tensor:$flow_out ); } @@ -10371,7 +11981,7 @@ def TF_TensorArrayReadV3Op : TF_Op<"TensorArrayReadV3", []> { let summary = "Read an element from the TensorArray into output `value`."; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, I32Tensor:$index, F32Tensor:$flow_in ); @@ -10393,7 +12003,7 @@ Scatter the data from the input value into specific TensorArray elements. }]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, I32Tensor:$indices, TF_Tensor:$value, F32Tensor:$flow_in @@ -10410,7 +12020,7 @@ def TF_TensorArraySizeV3Op : TF_Op<"TensorArraySizeV3", []> { let summary = "Get the current size of the TensorArray."; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, F32Tensor:$flow_in ); @@ -10445,7 +12055,7 @@ and having size }]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, TF_Tensor:$value, I64Tensor:$lengths, F32Tensor:$flow_in @@ -10477,7 +12087,7 @@ Write data via Write and read via Read or Pack. ); let results = (outs - TF_ResourceTensor:$handle, + Res:$handle, F32Tensor:$flow ); } @@ -10486,7 +12096,7 @@ def TF_TensorArrayWriteV3Op : TF_Op<"TensorArrayWriteV3", []> { let summary = "Push an element onto the tensor_array."; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, I32Tensor:$index, TF_Tensor:$value, F32Tensor:$flow_in @@ -10881,6 +12491,40 @@ On GPU, if an out of bound index is found, the index is ignored. ]; } +def TF_TensorStridedSliceUpdateOp : TF_Op<"TensorStridedSliceUpdate", [NoSideEffect]> { + let summary = "Assign `value` to the sliced l-value reference of `input`."; + + let description = [{ +The values of `value` are assigned to the positions in the tensor `input` that +are selected by the slice parameters. The slice parameters `begin` `end` +`strides` etc. work exactly as in `StridedSlice`. + +NOTE this op currently does not support broadcasting and so `value`'s shape +must be exactly the shape produced by the slice of `input`. + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$begin, + TF_I32OrI64Tensor:$end, + TF_I32OrI64Tensor:$strides, + TF_Tensor:$value, + + DefaultValuedAttr:$begin_mask, + DefaultValuedAttr:$end_mask, + DefaultValuedAttr:$ellipsis_mask, + DefaultValuedAttr:$new_axis_mask, + DefaultValuedAttr:$shrink_axis_mask + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Index = TF_DerivedOperandTypeAttr<1>; +} + def TF_TileOp : TF_Op<"Tile", [NoSideEffect]> { let summary = "Constructs a tensor by tiling a given tensor."; @@ -10925,46 +12569,9 @@ array([[1, 2, 3, 1, 2, 3], TF_DerivedOperandTypeAttr Tmultiples = TF_DerivedOperandTypeAttr<1>; TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - // TODO(parkers): Add folds for multiples = [1,...]. - // TODO(parkers): Add errors for negative multiples and multiples.size() != - // input.rank() -} + let verifier = [{ return Verify(*this); }]; -def TF_ToBoolOp : TF_Op<"ToBool", [NoSideEffect]> { - let summary = "Converts a tensor to a scalar predicate."; - - let description = [{ -Converts a tensor to a scalar predicate with the following rules: - -- For 0D tensors, truthiness is determined by comparing against a "zero" - value. For numerical types it is the obvious zero. For strings it is the - empty string. 
- -- For >0D tensors, truthiness is determined by looking at the number of - elements. If has zero elements, then the result is false. Otherwise the - result is true. - -This matches the behavior of If and While for determining if a tensor counts -as true/false for a branch condition. - }]; - - let arguments = (ins - TF_Tensor:$input - ); - - let results = (outs - I1Tensor:$output - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value value", [{ - build(builder, result, RankedTensorType::get({}, builder.getI1Type()), - value); - }]>]; - - let hasCanonicalizer = 1; + let hasFolder = 1; } def TF_TopKV2Op : TF_Op<"TopKV2", [NoSideEffect]> { @@ -11388,13 +12995,51 @@ tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2) let verifier = [{ return VerifyUnsortedSegmentReduction(*this); }]; } +def TF_UpperBoundOp : TF_Op<"UpperBound", [NoSideEffect]> { + let summary = [{ +Applies upper_bound(sorted_search_values, values) along each row. + }]; + + let description = [{ +Each set of rows with the same index in (sorted_inputs, values) is treated +independently. The resulting row is the equivalent of calling +`np.searchsorted(sorted_inputs, values, side='right')`. + +The result is not a global index to the entire +`Tensor`, but rather just the index in the last dimension. + +A 2-D example: + sorted_sequence = [[0, 3, 9, 9, 10], + [1, 2, 3, 4, 5]] + values = [[2, 4, 9], + [0, 2, 6]] + + result = UpperBound(sorted_sequence, values) + + result == [[1, 2, 4], + [0, 2, 5]] + }]; + + let arguments = (ins + TF_Tensor:$sorted_inputs, + TF_Tensor:$values + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; +} + def TF_VarIsInitializedOp : TF_Op<"VarIsInitializedOp", []> { let summary = [{ Checks whether a resource handle-based variable has been initialized. }]; let arguments = (ins - TF_ResourceTensor:$resource + Arg:$resource ); let results = (outs @@ -11419,7 +13064,7 @@ shape(t) ==> [2, 2, 3] }]; let arguments = (ins - TF_ResourceTensor:$input + Arg:$input ); let results = (outs @@ -11563,14 +13208,14 @@ for binary operators. 
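Returning to the `UpperBound` op defined above: its row-wise `np.searchsorted(..., side='right')` behaviour is what `tf.searchsorted` exposes in Python, so the documented 2-D example can be reproduced as below (a sketch, assuming TF 2.x eager execution).

```python
import tensorflow as tf

sorted_sequence = tf.constant([[0, 3, 9, 9, 10],
                               [1, 2, 3, 4, 5]])
values = tf.constant([[2, 4, 9],
                      [0, 2, 6]])

# side='right' corresponds to the UpperBound kernel.
result = tf.searchsorted(sorted_sequence, values, side='right')
print(result.numpy())  # [[1 2 4]
                       #  [0 2 5]]
```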
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, - TF_I32OrI64Tensor:$broadcast_dims + Arg, [{the LHS input tensor}]>:$lhs, + Arg, [{the RHS input tensor}]>:$rhs, + Arg:$broadcast_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs_output, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs_output + Res, [{the broadcasted LHS tensor}]>:$lhs_output, + Res, [{the broadcasted RHS tensor}]>:$rhs_output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -11586,13 +13231,13 @@ https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, - TF_I32OrI64Tensor:$window_strides, - TF_I32OrI64Tensor:$padding, - TF_I32OrI64Tensor:$lhs_dilation, - TF_I32OrI64Tensor:$rhs_dilation, - TF_I32OrI64Tensor:$feature_group_count, + Arg, [{the input tensor}]>:$lhs, + Arg, [{the kernel tensor}]>:$rhs, + Arg:$window_strides, + Arg:$padding, + Arg:$lhs_dilation, + Arg:$rhs_dilation, + Arg:$feature_group_count, StrAttr:$dimension_numbers, StrAttr:$precision_config @@ -11615,8 +13260,8 @@ https://www.tensorflow.org/performance/xla/operation_semantics#dotgeneral }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, + Arg, [{the LHS tensor}]>:$lhs, + Arg, [{the RHS tensor}]>:$rhs, StrAttr:$dimension_numbers, StrAttr:$precision_config @@ -11644,8 +13289,11 @@ with dimension size equal to the rank of operand. }]; let arguments = (ins - TF_Tensor:$input, - TF_I32OrI64Tensor:$start_indices, + Arg:$input, + Arg:$start_indices, TF_I32OrI64Tensor:$size_indices ); @@ -11673,13 +13321,14 @@ Handling of out-of-bounds slice indices is implementation-defined. 
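As a note on `XlaDynamicSlice`: it takes runtime start indices plus static slice sizes, which is the same contract as `tf.slice` with a tensor-valued `begin`. The sketch below uses that plain-TF analogue rather than invoking the XLA op directly, purely to illustrate the semantics.

```python
import tensorflow as tf

x = tf.reshape(tf.range(12), [3, 4])
start = tf.constant([1, 1])      # start indices known only at runtime
size = [2, 2]                    # slice sizes fixed at graph-build time

print(tf.slice(x, begin=start, size=size).numpy())
# [[ 5  6]
#  [ 9 10]]
```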
}]; let arguments = (ins - TF_Tensor:$input, - TF_Tensor:$update, - TF_I32OrI64Tensor:$indices + Arg:$input, + Arg:$update, + Arg:$indices ); let results = (outs - TF_Tensor:$output + Res:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -11694,9 +13343,9 @@ https://www.tensorflow.org/xla/operation_semantics#gather }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$operand, - TF_I32OrI64Tensor:$start_indices, - TF_I32OrI64Tensor:$slice_sizes, + Arg, [{The array we're gathering from.}]>:$operand, + Arg:$start_indices, + Arg:$slice_sizes, StrAttr:$dimension_numbers, BoolAttr:$indices_are_sorted @@ -11745,13 +13394,13 @@ Sorts a tensor. Currently only sorts in ascending order are supported. }]; let arguments = (ins - TF_IntOrFpTensor:$keys, - TF_Tensor:$values + Arg:$keys, + Arg:$values ); let results = (outs - TF_IntOrFpTensor:$sorted_keys, - TF_Tensor:$sorted_values + Res:$sorted_keys, + Res:$sorted_values ); TF_DerivedOperandTypeAttr V = TF_DerivedOperandTypeAttr<1>; @@ -11767,15 +13416,15 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad }]; let arguments = (ins - TF_Tensor:$input, - TF_Tensor:$padding_value, - TF_I32OrI64Tensor:$padding_low, - TF_I32OrI64Tensor:$padding_high, - TF_I32OrI64Tensor:$padding_interior + Arg:$input, + Arg:$padding_value, + Arg:$padding_low, + Arg:$padding_high, + Arg:$padding_interior ); let results = (outs - TF_Tensor:$output + Res:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -11785,6 +13434,13 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { let summary = "An op to receive a tensor from the host."; + let description = [{ +output: the tensor that will be received from the host. +Toutput: element type for output. +shape: shape for output. +key: A unique identifier for this region used to match up host transfers. + }]; + let arguments = (ins TF_ShapeAttr:$shape, StrAttr:$key @@ -11805,8 +13461,8 @@ https://www.tensorflow.org/performance/xla/operation_semantics#reduce . }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$init_value, + Arg, [{the input tensor}]>:$input, + Arg, [{a scalar representing the initial value for the reduction}]>:$init_value, I64ArrayAttr:$dimensions_to_reduce, SymbolRefAttr:$reducer @@ -11829,6 +13485,32 @@ def TF_XlaReplicaIdOp : TF_Op<"XlaReplicaId", [NoSideEffect]> { ); } +def TF_XlaScatterOp : TF_Op<"XlaScatter", [NoSideEffect]> { + let summary = "Wraps the XLA Scatter operator documented at"; + + let description = [{ +https://www.tensorflow.org/xla/operation_semantics#scatter. 
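`XlaScatter` generalizes functional scatter with an arbitrary `update_computation`; for the common "replace" case, plain TF offers `tf.tensor_scatter_nd_update`, shown here only as an illustration of functional (non-resource) scatter semantics, not as an equivalent of the full op.

```python
import tensorflow as tf

operand = tf.zeros([8], dtype=tf.int32)
indices = tf.constant([[1], [4], [6]])
updates = tf.constant([10, 20, 30])

# Functional scatter: returns a new tensor, `operand` is left unchanged.
result = tf.tensor_scatter_nd_update(operand, indices, updates)
print(result.numpy())  # [ 0 10  0  0 20  0 30  0]
```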
+ }]; + + let arguments = (ins + Arg, [{Array to be scattered into.}]>:$operand, + Arg:$scatter_indices, + Arg, [{Array containing the values that must be used for scattering.}]>:$updates, + + SymbolRefAttr:$update_computation, + StrAttr:$dimension_numbers, + BoolAttr:$indices_are_sorted + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaSelfAdjointEigOp : TF_Op<"XlaSelfAdjointEig", [NoSideEffect]> { let summary = [{ Computes the eigen decomposition of a batch of self-adjoint matrices @@ -11843,7 +13525,7 @@ i=0...N-1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + Arg, [{the input tensor.}]>:$a, BoolAttr:$lower, I64Attr:$max_iter, @@ -11851,8 +13533,10 @@ i=0...N-1. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$w, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + Res, [{The eigenvalues in ascending order, each repeated according to its +multiplicity.}]>:$w, + Res, [{The column v[..., :, i] is the normalized eigenvector corresponding to the +eigenvalue w[..., i].}]>:$v ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -11861,6 +13545,12 @@ i=0...N-1. def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { let summary = "An op to send a tensor to the host."; + let description = [{ +input: the tensor that will be sent to the host. +Tinput: element type for input. +key: A unique identifier for this region used to match up host transfers. + }]; + let arguments = (ins TF_Tensor:$input, @@ -11885,7 +13575,7 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + Arg, [{the input tensor.}]>:$a, I64Attr:$max_iter, F32Attr:$epsilon, @@ -11893,9 +13583,10 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$s, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$u, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + Res, [{Singular values. 
The values are sorted in reverse order of magnitude, so +s[..., 0] is the largest value, s[..., 1] is the second largest, etc.}]>:$s, + Res, [{Left singular vectors.}]>:$u, + Res, [{Right singular vectors.}]>:$v ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -11946,6 +13637,43 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { + let summary = "Internal FusedBatchNorm operation: reserved for internal use."; + + let description = [{ +Do not invoke this operator directly in Python. A fusion optimization is +expected to create these operators. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + Variadic>:$side_input, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$activation_mode, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; +} + def TF__FusedConv2DOp : TF_Op<"_FusedConv2D", [NoSideEffect]> { let summary = [{ Performs a convolution followed by a specified series of operations. @@ -11983,7 +13711,8 @@ create these operators. DefaultValuedAttr:$dilations, DefaultValuedAttr:$use_cudnn_on_gpu, DefaultValuedAttr:$fused_ops, - DefaultValuedAttr:$epsilon + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$leakyrelu_alpha ); let results = (outs @@ -12049,13 +13778,19 @@ Tensor of activations per table specified in the model. }]; let arguments = (ins - TF_VariantTensor:$deduplication_data, + Arg:$deduplication_data, StrAttr:$config ); let results = (outs - Variadic:$outputs + Res, [{A TensorList of embedding activations containing one Tensor per +embedding table in the model.}]>:$outputs ); TF_DerivedResultSizeAttr num_tables = TF_DerivedResultSizeAttr<0>; @@ -12067,18 +13802,17 @@ Compiles a computations for execution on one or more TPU devices. }]; let description = [{ -For the internal use of the distributed TPU compiler. Note that currently only -single TPU device is supported. +For the internal use of the distributed TPU compiler. 'mlir_module' is a serialized MLIR module with a `main` function that contains target computation. 'dynamic_shapes' contains dynamic shapes of arguments whose shapes were not known statically at TPUReplication rewrite time. -'metadata' is a serialized TPUCompileMetadataProto describing -the shapes and types of the inputs to the computation, as well as a mapping onto -the TPU pod topology. -'program' output is a string key that is passed to the _TPUExecute op and -used to look up the program in the compilation cache. +'metadata' is a serialized TPUCompileMetadataProto describing the shapes and +types of the inputs to the computation, as well as a mapping onto the TPU pod +topology. +'program' output is a string key that is passed to the TPUExecute op and used to +look up the program in the compilation cache. 
}]; let arguments = (ins @@ -12115,13 +13849,35 @@ rewrite passes must replace this op with a _TPUCompileMlir op `program` output. ); } +def TF__UnaryOpsCompositionOp : TF_Op<"_UnaryOpsComposition", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is + }]; + + let description = [{ +expected to create these operators. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64]>:$x, + + StrArrayAttr:$op_names + ); + + let results = (outs + TensorOf<[F16, F32, F64]>:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF__XlaHostComputeMlirOp : TF_Op<"_XlaHostComputeMlir", []> { let summary = [{ A pseudo-op to represent host-side computation in an XLA program. }]; let arguments = (ins - Variadic:$inputs, + Arg, [{A list of tensors that will be sent to the host.}]>:$inputs, StrAttr:$send_key, StrAttr:$recv_key, @@ -12129,7 +13885,7 @@ A pseudo-op to represent host-side computation in an XLA program. ); let results = (outs - Variadic:$outputs + Res, [{A list of tensors that will be returned to the device.}]>:$outputs ); TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; @@ -12142,14 +13898,15 @@ A placeholder op to receive values from a running XLA computation. }]; let arguments = (ins - TF_StrTensor:$dynamic_key, + Arg:$dynamic_key, StrAttr:$key, I64Attr:$device_ordinal ); let results = (outs - Variadic:$outputs + Res, [{A list of tensors that will be received from the XLA computation.}]>:$outputs ); TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; @@ -12159,8 +13916,9 @@ def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { let summary = "A placeholder op to send values to a running XLA computation."; let arguments = (ins - Variadic:$inputs, - TF_StrTensor:$dynamic_key, + Arg, [{A list of tensors that will be sent to the XLA computation.}]>:$inputs, + Arg:$dynamic_key, StrAttr:$key, I64Attr:$device_ordinal diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 1755c975c23..1edae47cfe6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -46,7 +46,7 @@ Invariants: TODO: Make invariants more structured so that we can reference them in ops. 
}]; - let cppNamespace = "TF"; + let cppNamespace = "::mlir::TF"; } //===----------------------------------------------------------------------===// @@ -108,14 +108,29 @@ class TF_ResourceBase : def TF_VariableResource : TF_ResourceBase<"Variable">; def TF_StackResource : TF_ResourceBase<"Stack">; def TF_TensorArrayResource : TF_ResourceBase<"TensorArray">; +def TF_SummaryResource : TF_ResourceBase<"Summary">; +def TF_LookupTableResource : TF_ResourceBase<"LookupTable">; def TF_VariableRead : MemRead; def TF_StackRead : MemRead; def TF_TensorArrayRead : MemRead; +def TF_LookupTableRead : MemRead; def TF_VariableWrite : MemWrite; def TF_StackWrite : MemWrite; def TF_TensorArrayWrite : MemWrite; +def TF_SummaryWrite : MemWrite; +def TF_LookupTableWrite : MemWrite; + +def TF_VariableAlloc : MemAlloc; +def TF_StackAlloc : MemAlloc; +def TF_TensorArrayAlloc : MemAlloc; +def TF_SummaryAlloc : MemAlloc; +def TF_LookupTableAlloc : MemAlloc; + +def TF_StackFree : MemFree; +def TF_TensorArrayFree : MemFree; +def TF_SummaryFree : MemFree; //===----------------------------------------------------------------------===// // TensorFlow op definitions @@ -157,20 +172,10 @@ class TF_TensorFlowType : "TensorFlow " # description # " type">, BuildableType<"getType()">; -// Any tensor element type allowed in TensorFlow ops -def TF_ElementType : Type, - "tf.dtype">; - -// Any TensorFlow tensor type -def TF_Tensor : TensorOf<[TF_ElementType]>; - //===----------------------------------------------------------------------===// // Integer types +// TODO(mgester) shouldn't this be SignedIntOfWidths? def TF_I32Or64 : SignlessIntOfWidths<[32, 64]>; def TF_I32OrI64Tensor : TensorOf<[TF_I32Or64]>; @@ -191,10 +196,11 @@ def TF_Uint64Tensor : TensorOf<[TF_Uint64]>; def TF_UInt : UnsignedIntOfWidths<[8, 16, 32, 64]>; // Any signed integer type +// TODO(mgester) shouldn't this be SignedIntOfWidths? 
def TF_SInt : SignlessIntOfWidths<[8, 16, 32, 64]>; // Any integer type -def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt]>; +def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt], "integer">; // Any integer tensor types def TF_IntTensor : TensorOf<[TF_Int]>; @@ -208,8 +214,8 @@ def TF_Quint8 : TF_TensorFlowType<"Quint8", "quint8">; def TF_Quint16 : TF_TensorFlowType<"Quint16", "quint16">; // Any quantized type -def TF_AnyQuantized : AnyTypeOf<[TF_Qint8, TF_Qint16, TF_Qint32, TF_Quint8, - TF_Quint16]>; +def TF_Quantized : AnyTypeOf<[TF_Qint8, TF_Qint16, TF_Qint32, TF_Quint8, + TF_Quint16], "quantized">; //===----------------------------------------------------------------------===// // Floating-point types @@ -217,8 +223,10 @@ def TF_F32Or64 : FloatOfWidths<[32, 64]>; def TF_F32OrF64Tensor : TensorOf<[TF_F32Or64]>; +def TF_Float : AnyTypeOf<[F16, F32, F64, BF16], "floating-point">; + // Any floating-point tensor types -def TF_FpTensor : TensorOf<[AnyFloat]>; +def TF_FpTensor : TensorOf<[TF_Float]>; //===----------------------------------------------------------------------===// // Complex types @@ -231,10 +239,9 @@ def TF_Complex64Tensor : TensorOf<[TF_Complex64]>; def TF_Complex128 : Complex>; def TF_Complex128Tensor : TensorOf<[TF_Complex128]>; -def TF_AnyComplex : AnyTypeOf<[TF_Complex64, TF_Complex128], - "64/128-bit complex type">; +def TF_Complex : AnyTypeOf<[TF_Complex64, TF_Complex128], "complex">; -def TF_ComplexTensor : TensorOf<[TF_AnyComplex]>; +def TF_ComplexTensor : TensorOf<[TF_Complex]>; //===----------------------------------------------------------------------===// // String/variant/resource types @@ -248,27 +255,114 @@ def TF_VariantTensor : TensorOf<[TF_Variant]>; def TF_Resource : TF_TensorFlowType<"Resource", "resource">; def TF_ResourceTensor : TensorOf<[TF_Resource]>; +//===----------------------------------------------------------------------===// +// Reference types + +// Float reference types +def TF_F16Ref : TF_TensorFlowType<"HalfRef", "f16ref">; +def TF_F32Ref : TF_TensorFlowType<"FloatRef", "f32ref">; +def TF_F64Ref : TF_TensorFlowType<"DoubleRef", "f64ref">; +def TF_Bfloat16Ref : TF_TensorFlowType<"Bfloat16Ref", "bf16ref">; + +// Any float reference type +def TF_FloatRef : AnyTypeOf<[TF_F16Ref, TF_F32Ref, TF_F64Ref, TF_Bfloat16Ref], + "floating-point reference">; + +// Complex reference types +def TF_Complex64Ref : TF_TensorFlowType<"Complex64Ref", "complex64ref">; +def TF_Complex128Ref : TF_TensorFlowType<"Complex128Ref", "complex128ref">; + +// Any complex reference type +def TF_ComplexRef : AnyTypeOf<[TF_Complex64Ref, TF_Complex128Ref], "complex reference">; + +// Integer reference types +def TF_Int8Ref : TF_TensorFlowType<"Int8Ref", "i8ref">; +def TF_Int16Ref : TF_TensorFlowType<"Int16Ref", "i16ref">; +def TF_Int32Ref : TF_TensorFlowType<"Int32Ref", "i32ref">; +def TF_Int64Ref : TF_TensorFlowType<"Int64Ref", "i64ref">; + +def TF_Uint8Ref : TF_TensorFlowType<"Uint8Ref", "ui8ref">; +def TF_Uint16Ref : TF_TensorFlowType<"Uint16Ref", "ui16ref">; +def TF_Uint32Ref : TF_TensorFlowType<"Uint32Ref", "ui32ref">; +def TF_Uint64Ref : TF_TensorFlowType<"Uint64Ref", "ui64ref">; + +// Any signed integer reference type +def TF_SIntRef : AnyTypeOf<[TF_Int8Ref, TF_Int16Ref, TF_Int32Ref, TF_Int64Ref], + "signed integer reference">; + +// Any unsigned integer reference type +def TF_UIntRef : AnyTypeOf<[TF_Uint8Ref, TF_Uint16Ref, TF_Uint32Ref, + TF_Uint64Ref], "unsigned integer reference">; + +// Any integer reference type +def TF_IntRef : AnyTypeOf<[TF_SIntRef, TF_UIntRef], "integer 
reference">; + +// Quantized reference types +def TF_Qint8Ref : TF_TensorFlowType<"Qint8Ref", "qint8ref">; +def TF_Qint16Ref : TF_TensorFlowType<"Qint16Ref", "qint16ref">; +def TF_Qint32Ref : TF_TensorFlowType<"Qint32Ref", "qint32ref">; +def TF_Quint8Ref : TF_TensorFlowType<"Quint8Ref", "quint8ref">; +def TF_Quint16Ref : TF_TensorFlowType<"Quint16Ref", "quint16ref">; + +// Any quantized reference type +def TF_QuantizedRef : AnyTypeOf<[TF_Qint8Ref, TF_Qint16Ref, TF_Qint32Ref, + TF_Quint8Ref, TF_Quint16Ref], "quantized reference">; + +// Other reference types +def TF_BoolRef : TF_TensorFlowType<"BoolRef", "boolref">; +def TF_ResourceRef : TF_TensorFlowType<"ResourceRef", "resourceref">; +def TF_StrRef : TF_TensorFlowType<"StringRef", "stringref">; +def TF_VariantRef : TF_TensorFlowType<"VariantRef", "variantref">; + +// Reference tensor types +def TF_FpRefTensor : TensorOf<[TF_FloatRef]>; +def TF_I32OrI64RefTensor : TensorOf<[TF_Int32Ref, TF_Int64Ref]>; + //===----------------------------------------------------------------------===// // Multi-category type constraints def TF_IntOrF32OrF64Tensor: TensorOf<[TF_Int, TF_F32Or64]>; -def TF_FpOrI32OrI64Tensor : TensorOf<[AnyFloat, TF_I32Or64]>; +def TF_FpOrI32OrI64Tensor : TensorOf<[TF_Float, TF_I32Or64]>; // Any integer or floating-point tensor types -def TF_IntOrFpTensor : TensorOf<[TF_Int, AnyFloat]>; +def TF_IntOrFpTensor : TensorOf<[TF_Int, TF_Float]>; -def TF_SintOrFpTensor : TensorOf<[TF_SInt, AnyFloat]>; +def TF_SintOrFpTensor : TensorOf<[TF_SInt, TF_Float]>; -def TF_FpOrComplexTensor : TensorOf<[AnyFloat, TF_AnyComplex]>; +def TF_FpOrComplexTensor : TensorOf<[TF_Float, TF_Complex]>; -def TF_AnyNumber : AnyTypeOf<[TF_Int, AnyFloat, TF_AnyQuantized, TF_AnyComplex], - "number">; +def TF_Number : AnyTypeOf<[TF_Int, TF_Float, TF_Quantized, TF_Complex], + "number">; +def TF_NumberRef : AnyTypeOf<[TF_IntRef, TF_FloatRef, TF_QuantizedRef, + TF_ComplexRef], "number reference">; -def TF_NumberTensor : TensorOf<[TF_AnyNumber]>; +def TF_NumberTensor : TensorOf<[TF_Number]>; +def TF_NumberRefTensor : TensorOf<[TF_NumberRef]>; -def TF_NumberOrStr : AnyTypeOf<[AnyFloat, TF_SInt, TF_AnyComplex, TF_Uint8, TF_Str]>; -def TF_NumberOrStrTensor : TensorOf<[TF_NumberOrStr]>; +def TF_NumberNotQuantizedOrStr : + AnyTypeOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Str]>; +def TF_NumberNotQuantizedOrStrRef : + AnyTypeOf<[TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint8Ref, TF_StrRef]>; +def TF_NumberNotQuantizedOrStrTensor : TensorOf<[TF_NumberNotQuantizedOrStr]>; + +//===----------------------------------------------------------------------===// +// Tensor and tensor element types + +// Bool type +def TF_Bool : I<1>; + +// Any tensor element type allowed in TensorFlow ops +// (see https://www.tensorflow.org/api_docs/python/tf/dtypes/DType) +def TF_ElementType : Type, + "tf.dtype">; + +// Any TensorFlow tensor type +def TF_Tensor : TensorOf<[TF_ElementType]>; //===----------------------------------------------------------------------===// // TensorFlow attribute definitions diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h index ec1f748367d..1eb5c89f0fc 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h @@ -15,12 +15,40 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" namespace mlir { namespace TF { + +//===----------------------------------------------------------------------===// +// TensorFlow Contraction Fusion. +//===----------------------------------------------------------------------===// + +struct ContractionFusion { + explicit ContractionFusion( + StringRef output_kernel, ArrayRef additional_arguments = {}, + ArrayRef additional_attributes = {}) + : output_kernel(output_kernel.str()), + additional_arguments(additional_arguments.begin(), + additional_arguments.end()), + additional_attributes(additional_attributes.begin(), + additional_attributes.end()) {} + + // Name of the output kernel implementing the contraction fusion. + std::string output_kernel; + + // Indices of additional arguments that will be forwarded to the fused + // operation (e.g. forward bias vector if fusing BiasAdd operation). + SmallVector additional_arguments; + + // Add additional attributes to the fused node. + SmallVector additional_attributes; +}; + #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h.inc" } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index 3743bdda043..3c41c04a0d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -21,7 +21,7 @@ limitations under the License. include "mlir/IR/OpBase.td" //===----------------------------------------------------------------------===// -// TensorFlow interfaces +// TensorFlow Layout Optimization Interfaces. //===----------------------------------------------------------------------===// def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { @@ -104,4 +104,25 @@ def TF_FoldOperandsTransposeInterface : OpInterface<"FoldOperandsTransposeInterf }]; } +//===----------------------------------------------------------------------===// +// TensorFlow Contraction Fusion Interfaces. +//===----------------------------------------------------------------------===// + +def TF_ContractionFusableInterface : OpInterface<"ContractionFusableInterface"> { + let description = [{ + A contraction fusable operation is one that can be fused into the output of + a tensor contraction (MatMul, Conv2D, etc...) operation. + + For example all element wise operations are trivially contraction fusable. + }]; + + let methods = [ + InterfaceMethod< + [{Returns contraction fusion if the operation satisfies all the fusion + requirements. Otherwise returns empty optional.}], + "Optional", "GetContractionFusion", (ins) + >, + ]; +} + #endif // TF_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index abff4c21cf1..634004038d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -55,6 +55,8 @@ limitations under the License. 
#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/DecodeAttributesInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FoldInterfaces.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -74,23 +76,6 @@ namespace TF { //===----------------------------------------------------------------------===// namespace { -// Returns true if the op can be duplicated. -bool CanDuplicate(Operation *op) { - // If the op is marked with the cannot duplicate trait, it cannot be - // duplicated. - if (op->hasTrait()) return false; - - // If the op has no memory side effects, it can be duplicated. - if (MemoryEffectOpInterface::hasNoEffect(op)) return true; - - // If the op is marked stateless using the `is_stateless` attribute, that - // attribute determines if the op can be duplicated. - if (auto is_stateless = op->getAttrOfType("is_stateless")) - return is_stateless.getValue(); - - // Otherwise, assume ops can be duplicated by default. - return true; -} // Returns true of the given function has a single uses (within the scope // of the module containing it and all parent modules). @@ -129,6 +114,22 @@ bool HasSingleUse(FuncOp func) { return true; } +struct TFConstantFoldInterface : public DialectFoldInterface { + TFConstantFoldInterface(Dialect *dialect) : DialectFoldInterface(dialect) {} + LogicalResult fold(Operation *op, ArrayRef operands, + SmallVectorImpl &results) const final { + return TensorFlowDialect::constantFold(op, operands, results); + } +}; + +struct TFDecodeAttributesInterface : public DialectDecodeAttributesInterface { + TFDecodeAttributesInterface(Dialect *dialect) + : DialectDecodeAttributesInterface(dialect) {} + LogicalResult decode(OpaqueElementsAttr input, ElementsAttr &output) const { + return TensorFlowDialect::decode(input, output); + } +}; + struct TFInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -156,7 +157,7 @@ struct TFInlinerInterface : public DialectInlinerInterface { // post inlining, the function will be dead and eliminated from the IR. // So there won't be any code duplication. FuncOp func = op->getParentOfType(); - return !func || CanDuplicate(op) || HasSingleUse(func); + return !func || TensorFlowDialect::CanDuplicate(op) || HasSingleUse(func); } //===--------------------------------------------------------------------===// @@ -183,22 +184,66 @@ struct TFInlinerInterface : public DialectInlinerInterface { // TF Dialect //===----------------------------------------------------------------------===// +// Returns true if the op can be duplicated. +bool TensorFlowDialect::CanDuplicate(Operation *op) { + // If the op is marked with the cannot duplicate trait, it cannot be + // duplicated. + if (op->hasTrait()) return false; + + // If the op has no memory side effects, it can be duplicated. + if (MemoryEffectOpInterface::hasNoEffect(op)) return true; + + // If the op is marked stateless using the `is_stateless` attribute, that + // attribute determines if the op can be duplicated. + if (auto is_stateless = op->getAttrOfType("is_stateless")) + return is_stateless.getValue(); + + // Otherwise, assume ops can be duplicated by default if its registered, else + // it cannot be for unknown ops. 
+ return op->isRegistered(); +} + +// Returns true if the op can have side effects. +bool TensorFlowDialect::CanHaveSideEffects(Operation *op) { + // If the op has no memory side effects, it has no side effects + if (MemoryEffectOpInterface::hasNoEffect(op)) return false; + + // If the op is marked stateless using the `is_stateless` attribute, then + // it has no side effects. + if (auto is_stateless = op->getAttrOfType("is_stateless")) + return !is_stateless.getValue(); + + // Terminators defined in the TF dialect do not have side effects. + if (op->isKnownTerminator()) return false; + + // Otherwise assume that the op can have side effects. + return true; +} + std::vector *TensorFlowDialect::additional_operation_hooks_ = new std::vector(); +TensorFlowDialect::ConstantFoldHook TensorFlowDialect::constant_fold_hook_; +TensorFlowDialect::DecodeConstantHook TensorFlowDialect::decode_constant_hook_; + TensorFlowDialect::TensorFlowDialect(MLIRContext *context) - : Dialect(/*name=*/"tf", context) { + : Dialect(/*name=*/"tf", context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.cc.inc" >(); + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc.inc" + >(); addTypes< #define HANDLE_TF_TYPE(tftype, enumerant, name) tftype##Type, #define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" >(); - addInterfaces(); + addInterfaces(); addAttributes(); // Support unknown operations because not all TensorFlow operations are @@ -317,16 +362,12 @@ Attribute TensorFlowDialect::parseAttribute(DialectAsmParser &parser, void TensorFlowDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const { - switch (attr.getKind()) { - case AttrKind::SHAPE: - PrintShapeAttr(attr.cast(), os); - break; - case AttrKind::FUNC: - PrintFuncAttr(attr.cast(), os); - break; - default: - llvm_unreachable("unexpected tensorflow attribute kind"); - } + if (auto shape_attr = attr.dyn_cast()) + PrintShapeAttr(shape_attr, os); + else if (auto func_attr = attr.dyn_cast()) + PrintFuncAttr(func_attr, os); + else + llvm_unreachable("unexpected tensorflow attribute type"); } // Parses a type registered to this dialect. @@ -335,51 +376,37 @@ Type TensorFlowDialect::parseType(DialectAsmParser &parser) const { if (parser.parseKeyword(&data)) return Type(); Location loc = parser.getEncodedSourceLoc(parser.getNameLoc()); - auto typeKind = llvm::StringSwitch(data) + #define HANDLE_TF_TYPE(tftype, enumerant, name) \ - .Case(name, TensorFlowTypes::enumerant) + if (data == name) return tftype##Type::get(getContext()); // Custom TensorFlow types are handled separately at the end as they do partial // match. 
#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - .StartsWith("resource", TensorFlowTypes::RESOURCE) - .StartsWith("variant", TensorFlowTypes::VARIANT) - .Default(0); - switch (typeKind) { - default: - return (emitError(loc, "unknown TensorFlow type: " + data), nullptr); -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - return tftype##Type::get(getContext()); -#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) -// NOLINTNEXTLINE -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - case TensorFlowTypes::RESOURCE: - return ParseResourceType(parser, loc); - case TensorFlowTypes::VARIANT: - return ParseVariantType(parser, loc); - } + if (data.startswith("resource")) return ParseResourceType(parser, loc); + if (data.startswith("variant")) return ParseVariantType(parser, loc); + return (emitError(loc, "unknown TensorFlow type: " + data), nullptr); } // Prints a type registered to this dialect. void TensorFlowDialect::printType(Type ty, DialectAsmPrinter &os) const { assert(ty.isa()); - switch (ty.getKind()) { - default: - llvm_unreachable("unexpected tensorflow type kind"); -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - os << name; \ - break; +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (auto derived_ty = ty.dyn_cast()) { \ + os << name; \ + return; \ + } #define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - Print##tftype##Type(ty.cast(), os); \ - break; + if (auto derived_ty = ty.dyn_cast()) { \ + Print##tftype##Type(derived_ty, os); \ + return; \ + } // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - } + + llvm_unreachable("unexpected tensorflow type kind"); } namespace { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 039ed1bc3a8..2755a62a3c9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" namespace mlir { namespace TF { @@ -63,6 +64,12 @@ class TensorFlowDialect : public Dialect { // Returns the string description of stateful attribute. static StringRef GetStatefulAttrName() { return "tf.signature.is_stateful"; } + // Returns true if the op can be duplicated during transformations. + static bool CanDuplicate(Operation *op); + + // Returns true if the op can have side effects. 
+ static bool CanHaveSideEffects(Operation *op); + Attribute parseAttribute(DialectAsmParser &parser, Type type) const override; void printAttribute(Attribute attr, DialectAsmPrinter &os) const override; @@ -110,10 +117,35 @@ class TensorFlowDialect : public Dialect { 0, (addOperation(AbstractOperation::get(*this)), 0)...}; } + using ConstantFoldHook = LogicalResult (*)(Operation *, ArrayRef, + SmallVectorImpl &); + static void RegisterConstantFoldHook(ConstantFoldHook fn) { + constant_fold_hook_ = std::move(fn); + } + + static LogicalResult constantFold(Operation *op, ArrayRef operands, + SmallVectorImpl &results) { + if (constant_fold_hook_) return constant_fold_hook_(op, operands, results); + return failure(); + } + + using DecodeConstantHook = LogicalResult (*)(OpaqueElementsAttr input, + ElementsAttr &output); + static void RegisterDecodeConstantHook(DecodeConstantHook fn) { + decode_constant_hook_ = std::move(fn); + } + static LogicalResult decode(OpaqueElementsAttr input, ElementsAttr &output) { + if (decode_constant_hook_) return decode_constant_hook_(input, output); + return failure(); + } + private: // Hook functions which may add additional operations to the dialect. // These are invoked at construction time. static std::vector *additional_operation_hooks_; + + static ConstantFoldHook constant_fold_hook_; + static DecodeConstantHook decode_constant_hook_; }; } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 1e99675d938..5fe19f7b0cb 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -68,6 +68,100 @@ class TF_TensorListInitOp : TF_Op { }]; } +def TF_CaseOp : TF_Op<"Case", []> { + let summary = [{ +An n-way switch statement which calls a single branch function. + }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... + case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + Variadic:$input, + + Confined]>:$branches, + + // Used to map StatelessCase and Case op defined in TensorFlow to a common + // op. + BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; + + let hasCanonicalizer = 1; + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_CaseRegionOp : TF_Op<"CaseRegion", + [SingleBlockImplicitTerminator<"YieldOp">, NoRegionArguments]> { + let summary = [{ +An n-way switch statement which calls a single branch function. + }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... + case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + + // Used to map StatelessCase and Case op defined in TensorFlow to a common + // op. 
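The header half of this hunk adds `RegisterConstantFoldHook`/`RegisterDecodeConstantHook` so another library (presumably the constant-folding utilities) can install behavior after the fact; when nothing has been registered the dialect simply returns `failure()`. A simplified standalone sketch of that registration pattern, with toy types in place of `Operation`/`Attribute` (every name below is made up for illustration):

```
#include <iostream>
#include <vector>

// The dialect holds a static function pointer, a separate library registers
// an implementation, and callers get a failure result when no hook exists.
using FoldHook = bool (*)(const std::vector<int> &operands,
                          std::vector<int> &results);

struct ToyDialect {
  static FoldHook fold_hook;
  static void RegisterFoldHook(FoldHook fn) { fold_hook = fn; }
  static bool ConstantFold(const std::vector<int> &operands,
                           std::vector<int> &results) {
    if (fold_hook) return fold_hook(operands, results);
    return false;  // Mirrors `return failure();` when nothing is registered.
  }
};
FoldHook ToyDialect::fold_hook = nullptr;

// A toy hook: "fold" by summing the operands.
bool SumFold(const std::vector<int> &operands, std::vector<int> &results) {
  int sum = 0;
  for (int v : operands) sum += v;
  results.push_back(sum);
  return true;
}

int main() {
  std::vector<int> results;
  std::cout << ToyDialect::ConstantFold({1, 2}, results) << "\n";  // 0: no hook
  ToyDialect::RegisterFoldHook(SumFold);
  std::cout << ToyDialect::ConstantFold({1, 2}, results) << "\n";  // 1: folded
  std::cout << results[0] << "\n";                                 // 3
}
```

The indirection keeps the dialect library itself free of a dependency on the folding implementation while still letting `TFConstantFoldInterface` route through it.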
+ BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + let regions = (region VariadicRegion>:$branches); + + let verifier = [{ + return Verify(*this); + }]; +} + // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with // its type encoding the tensor's shape and data type. def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect, @@ -123,30 +217,6 @@ source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } - -def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Permute input tensor from `src_format` to `dst_format`"; - - let description = [{ -Input tensor must be a vector of size 4, or a 4x2 tensor. - }]; - - let arguments = (ins - TF_I32OrI64Tensor:$x, - - DefaultValuedAttr:$src_format, - DefaultValuedAttr:$dst_format - ); - - let results = (outs - TF_I32OrI64Tensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let verifier = [{ return Verify(*this); }]; -} - def TF_EmptyTensorListOp : TF_TensorListInitOp<"EmptyTensorList"> { let summary = "Creates and returns an empty tensor list."; @@ -235,19 +305,19 @@ else_branch: A function that takes 'inputs' and returns a list of let extraClassDeclaration = [{ // Get the then branch function. - FuncOp then_func() { + FuncOp then_function() { return SymbolTable::lookupNearestSymbolFrom(*this, then_branch()); } // Get the else branch function. - FuncOp else_func() { + FuncOp else_function() { return SymbolTable::lookupNearestSymbolFrom(*this, else_branch()); } }]; } def TF_YieldOp : TF_Op<"Yield", - [Terminator, ParentOneOf<["IfRegionOp", "WhileRegionOp"]>]> { + [Terminator, ParentOneOf<["CaseRegionOp", "IfRegionOp", "WhileRegionOp"]>]> { let summary = "Yield operation"; let description = [{ @@ -283,7 +353,7 @@ else_branch: A region that computes the outputs of the op if cond = false. }]; let arguments = (ins - TF_Tensor:$cond, + 0DTensorOf<[I1]>:$cond, // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless @@ -293,47 +363,13 @@ else_branch: A region that computes the outputs of the op if cond = false. Variadic:$output ); - TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; - TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; - let regions = (region SizedRegion<1>:$then_branch, SizedRegion<1>:$else_branch); let verifier = [{ return Verify(*this); }]; -} -def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { - let summary = "Computes the mean of elements across dimensions of a tensor."; - - let description = [{ -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. If `keep_dims` is true, the reduced dimensions are -retained with length 1. 
- }]; - - let arguments = (ins - TF_NumberTensor:$input, - TF_I32OrI64Tensor:$reduction_indices, - - DefaultValuedAttr:$keep_dims - ); - - let results = (outs - TF_NumberTensor:$output - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; - - let extraClassDeclaration = [{ - // TF_FoldOperandsTransposeInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {}; } - LogicalResult FoldOperandsPermutation(ArrayRef permutation); - }]; + let hasCanonicalizer = 1; } def TF_LegacyCallOp : TF_Op<"LegacyCall", @@ -534,36 +570,6 @@ def TF_PlaceholderWithDefaultOp : TF_Op<"PlaceholderWithDefault", [NoSideEffect] DerivedAttr shape = TF_DerivedResultShapeAttr; } -def TF_SparseMatMulOp : TF_Op<"SparseMatMul", [NoSideEffect]> { - let summary = [{ -SparseMatMul is MatMul with hints on the sparseness of the matrices. - }]; - - let description = [{ -Similar to MatMul, with a_is_sparse and b_is_sparse indicating whether a and b -are sparse matrices. - }]; - - let arguments = (ins - TensorOf<[BF16, F32]>:$a, - TensorOf<[BF16, F32]>:$b, - - DefaultValuedAttr:$a_is_sparse, - DefaultValuedAttr:$b_is_sparse, - - DefaultValuedAttr:$transpose_a, - DefaultValuedAttr:$transpose_b - ); - - let results = (outs - TensorOf<[F32]>:$product - ); - - TF_DerivedOperandTypeAttr Ta = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr Tb = TF_DerivedOperandTypeAttr<1>; -} - - def TF_StatefulPartitionedCallOp : TF_Op<"StatefulPartitionedCall", [CallOpInterface]> { let summary = @@ -655,12 +661,12 @@ body: A function that takes a list of tensors and returns another let extraClassDeclaration = [{ // Get the condition function. - FuncOp cond_func() { + FuncOp cond_function() { return SymbolTable::lookupNearestSymbolFrom(*this, cond()); } // Get the body function. - FuncOp body_func() { + FuncOp body_function() { return SymbolTable::lookupNearestSymbolFrom(*this, body()); } }]; @@ -710,8 +716,6 @@ def TL_WhileRegionOp : TF_Op<"WhileRegion", ); let results = (outs Variadic:$output); - TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; - let regions = (region SizedRegion<1>:$cond, SizedRegion<1>:$body); let verifier = [{ return Verify(*this); }]; @@ -787,7 +791,7 @@ Example: ); let results = (outs - TF_ResourceTensor:$resource + Res:$resource ); TF_DerivedOperandOrResultHandleTypeAttr dtype = @@ -796,45 +800,6 @@ Example: TF_DerivedOperandOrResultHandleShapeAttr<"resource">; } -// Not generated because it begins with an underscore, which isn't allowed by -// the C++ standard. -def TF_FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { - let summary = "Internal FusedBatchNorm operation: reserved for internal use"; - - let description = [{ - Do not invoke this operator directly in Python. A fusion optimization is - expected to create these operators. 
- }]; - - let arguments = (ins - TensorOf<[F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - Variadic>:$side_input, - - DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$exponential_avg_factor, - DefaultValuedAttr:$activation_mode, - DefaultValuedAttr:$data_format, - DefaultValuedAttr:$is_training - ); - - let results = (outs - TensorOf<[F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; - TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; -} - // Multiple variadic operands with different sizes are not supported by the // dialect generator, so we manually added the op. def TF_SendTPUEmbeddingGradientsOp : TF_Op<"SendTPUEmbeddingGradients", [AttrSizedOperandSegments]> { @@ -1105,6 +1070,43 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } +def TF_ToBoolOp : TF_Op<"ToBool", [NoSideEffect]> { + let summary = "Converts a tensor to a scalar predicate."; + + let description = [{ +Converts a tensor to a scalar predicate with the following rules: + +- For 0D tensors, truthiness is determined by comparing against a "zero" + value. For numerical types it is the obvious zero. For strings it is the + empty string. + +- For >0D tensors, truthiness is determined by looking at the number of + elements. If has zero elements, then the result is false. Otherwise the + result is true. + +This matches the behavior of If and While for determining if a tensor counts +as true/false for a branch condition. + }]; + + let arguments = (ins + TF_Tensor:$input + ); + + let results = (outs + 0DTensorOf<[I1]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value value", [{ + build(builder, result, RankedTensorType::get({}, builder.getI1Type()), + value); + }]>]; + + let hasCanonicalizer = 1; +} + def TF_BesselI0eOp : TF_Op<"BesselI0e", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Bessel i0e function of `x` element-wise."; @@ -1147,36 +1149,6 @@ This function is faster and numerically stabler than `bessel_i1(x)`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_StringToHashBucketFastOp : TF_Op<"StringToHashBucketFast", [NoSideEffect]> { - let summary = [{ -Converts each string in the input Tensor to its hash mod by a number of buckets. - }]; - - let description = [{ -The hash function is deterministic on the content of the string within the -process and will never change. However, it is not suitable for cryptography. -This function may be used when CPU time is scarce and inputs are trusted or -unimportant. There is a risk of adversaries constructing inputs that all hash -to the same bucket. To prevent this problem, use a strong hash function with -`tf.string_to_hash_bucket_strong`. 
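The bucketing described above is a deterministic hash reduced modulo `num_buckets`. A standalone sketch of that mapping; `std::hash` is only a stand-in for TensorFlow's stable fingerprint, so the concrete bucket ids it produces will differ from the `tf.strings.to_hash_bucket_fast` example that follows:

```
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// bucket = hash(input) % num_buckets, applied element-wise to a string tensor.
std::vector<int64_t> ToHashBucketFast(const std::vector<std::string> &input,
                                      int64_t num_buckets) {
  std::vector<int64_t> output;
  output.reserve(input.size());
  for (const std::string &s : input)
    output.push_back(static_cast<int64_t>(
        std::hash<std::string>{}(s) % static_cast<uint64_t>(num_buckets)));
  return output;
}

int main() {
  for (int64_t b : ToHashBucketFast({"Hello", "TensorFlow", "2.x"}, 3))
    std::cout << b << " ";  // Bucket ids in [0, 3); values depend on the hash.
  std::cout << "\n";
}
```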
- -Examples: - ->>> tf.strings.to_hash_bucket_fast(["Hello", "TensorFlow", "2.x"], 3).numpy() -array([0, 2, 2]) - }]; - - let arguments = (ins - TF_StrTensor:$input, - - Confined]>:$num_buckets - ); - - let results = (outs - I64Tensor:$output - ); -} - def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { let summary = "Calls a function placed on a specified TPU device."; @@ -1211,63 +1183,6 @@ def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { let verifier = [{ return VerifyPartitionedCall(*this); }]; } -class TF_FusedBatchNormOpBase : TF_Op { - let summary = "Batch normalization."; - - let description = [{ -Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -The size of 1D Tensors matches the dimension C of the 4D Tensors. - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - - DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$exponential_avg_factor, - DefaultValuedAttr:$data_format, - DefaultValuedAttr:$is_training - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; - - let extraClassDeclaration = [{ - // TF_FoldOperandsTransposeInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {0}; } - LogicalResult FoldOperandsPermutation(ArrayRef permutation); - - // TF_LayoutSensitiveInterface: - StringRef GetOptimalLayout(const RuntimeDevices& devices); - LogicalResult UpdateDataFormat(StringRef data_format); - }]; -} - -def TF_FusedBatchNormV2Op : TF_FusedBatchNormOpBase<"FusedBatchNormV2"> { - let results = (outs - TensorOf<[BF16, F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2 - ); -} - -def TF_FusedBatchNormV3Op : TF_FusedBatchNormOpBase<"FusedBatchNormV3"> { - let results = (outs - TensorOf<[BF16, F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 - ); -} - def TF_BatchFunctionOp : TF_Op<"BatchFunction", [AttrSizedOperandSegments]> { let summary = [{ Batches all the inputs tensors to the computation done by the function. @@ -1295,6 +1210,7 @@ So, for example, in the following code batch_timeout_micros=100000, # 100ms allowed_batch_sizes=[3, 10], batching_queue="") + ``` If more than one session.run call is simultaneously trying to compute `b` the values of `a` will be gathered, non-deterministically concatenated @@ -1338,4 +1254,625 @@ must be a Tensor or a list/tuple of Tensors. TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; } +def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x + y element-wise."; + + let description = [{ +*NOTE*: `Add` supports broadcasting. `AddN` does not. 
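Broadcasting here is the NumPy rule linked below: shapes are aligned from the trailing dimension, and each pair of dimensions must be equal or one of them must be 1. A small standalone sketch of that shape computation for static dimensions (the BatchMatMulV2 verifier later in this diff applies the same rule to the batch dimensions via `OpTrait::util::getBroadcastedShape`; dynamic dimensions need extra care and are not modeled here):

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Computes the broadcasted shape of two shapes, aligning from the right.
// Returns false if some dimension pair is incompatible (neither equal nor 1).
bool BroadcastShapes(const std::vector<int64_t> &a,
                     const std::vector<int64_t> &b,
                     std::vector<int64_t> &out) {
  size_t rank = std::max(a.size(), b.size());
  out.assign(rank, 1);
  for (size_t i = 0; i < rank; ++i) {
    int64_t da = i < a.size() ? a[a.size() - 1 - i] : 1;
    int64_t db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1) return false;
    out[rank - 1 - i] = std::max(da, db);
  }
  return true;
}

int main() {
  std::vector<int64_t> out;
  // [2, 3, 1] with [3, 4] broadcasts to [2, 3, 4].
  if (BroadcastShapes({2, 3, 1}, {3, 4}, out))
    for (int64_t d : out) std::cout << d << " ";
  std::cout << "\n";
}
```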
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint8Ref, TF_Uint32Ref]>:$x, + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint8Ref, TF_Uint32Ref]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint8Ref, TF_Uint32Ref]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; + + let hasFolder = 1; +} + +def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns 0 if the denominator is zero."; + + let description = [{ +*NOTE*: `DivNoNan` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex, TF_F16Ref, TF_F32Ref, TF_F64Ref, TF_ComplexRef]>:$x, + TensorOf<[F16, F32, F64, TF_Complex, TF_F16Ref, TF_F32Ref, TF_F64Ref, TF_ComplexRef]>:$y + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex, TF_F16Ref, TF_F32Ref, TF_F64Ref, TF_ComplexRef]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns the max of x and y (i.e. x > y ? x : y) element-wise."; + + let description = [{ +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, I16, I32, I64, TF_Uint8, TF_FloatRef, TF_Int16Ref, TF_Int32Ref, TF_Int64Ref, TF_Uint8Ref]>:$x, + TensorOf<[TF_Float, I16, I32, I64, TF_Uint8, TF_FloatRef, TF_Int16Ref, TF_Int32Ref, TF_Int64Ref, TF_Uint8Ref]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, I16, I32, I64, TF_Uint8, TF_FloatRef, TF_Int16Ref, TF_Int32Ref, TF_Int64Ref, TF_Uint8Ref]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x / y element-wise for real types."; + + let description = [{ +If `x` and `y` are reals, this will return the floating-point division. + +*NOTE*: `Div` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint16Ref, TF_Uint8Ref]>:$x, + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint16Ref, TF_Uint8Ref]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8, TF_FloatRef, TF_SIntRef, TF_ComplexRef, TF_Uint16Ref, TF_Uint8Ref]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; + + let hasFolder = 1; +} + +def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x + y element-wise."; + + let description = [{ +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +Given two input tensors, the `tf.add` operation computes the sum for every element in the tensor. + +Both input and output have a range `(-inf, inf)`. + }]; + + let arguments = (ins + TensorOf<[TF_NumberNotQuantizedOrStr, TF_NumberNotQuantizedOrStrRef]>:$x, + TensorOf<[TF_NumberNotQuantizedOrStr, TF_NumberNotQuantizedOrStrRef]>:$y + ); + + let results = (outs + TensorOf<[TF_NumberNotQuantizedOrStr, TF_NumberNotQuantizedOrStrRef]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; +} + +def TF_StatefulStandardNormalV2Op : TF_Op<"StatefulStandardNormalV2", []> { + let summary = "Outputs random values from a normal distribution."; + + let description = [{ +The generated values will have mean 0 and standard deviation 1. + }]; + + let arguments = (ins + Arg:$resource, + I64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulTruncatedNormalOp : TF_Op<"StatefulTruncatedNormal", []> { + let summary = "Outputs random values from a truncated normal distribution."; + + let description = [{ +The generated values follow a normal distribution with mean 0 and standard +deviation 1, except that values whose magnitude is more than 2 standard +deviations from the mean are dropped and re-picked. + }]; + + let arguments = (ins + Arg:$resource, + I64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulUniformOp : TF_Op<"StatefulUniform", []> { + let summary = "Outputs random values from a uniform distribution."; + + let description = [{ +The generated values follow a uniform distribution in the range `[0, 1)`. The +lower bound 0 is included in the range, while the upper bound 1 is excluded. 
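The half-open range documented above means 0 can be produced but 1 never is. A tiny standalone illustration using the standard library generator; the actual kernel draws from the stateful `resource`/`algorithm` pair (a counter-based generator), which this does not model:

```
#include <iostream>
#include <random>

int main() {
  // Draw a few samples uniformly from [0, 1): 0.0 can occur, 1.0 cannot.
  std::mt19937_64 rng(/*seed=*/42);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (int i = 0; i < 4; ++i) std::cout << uniform(rng) << " ";
  std::cout << "\n";
}
```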
+ }]; + + let arguments = (ins + Arg:$resource, + I64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulUniformFullIntOp : TF_Op<"StatefulUniformFullInt", []> { + let summary = "Outputs random integers from a uniform distribution."; + + let description = [{ +The generated values are uniform integers covering the whole range of `dtype`. + }]; + + let arguments = (ins + Arg:$resource, + I64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TensorOf<[I32, I64, TF_Uint32, TF_Uint64]>:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +// TODO(lyandy): Investigate supported dtypes (`minval`, `maxval`, `output`) for +// `tf.StatefulUniformInt`. tf2xla kernels support i32, i64, ui32, and ui64 +// while TensorFlow CPU/GPU kernels only support i32 and i64. +def TF_StatefulUniformIntOp : TF_Op<"StatefulUniformInt", []> { + let summary = "Outputs random integers from a uniform distribution."; + + let description = [{ +The generated values are uniform integers in the range `[minval, maxval)`. +The lower bound `minval` is included in the range, while the upper bound +`maxval` is excluded. + +The random integers are slightly biased unless `maxval - minval` is an exact +power of two. The bias is small for values of `maxval - minval` significantly +smaller than the range of the output (either `2^32` or `2^64`). + }]; + + let arguments = (ins + Arg:$resource, + I64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape, + TensorOf<[I32, I64, TF_Uint32, TF_Uint64]>:$minval, + TensorOf<[I32, I64, TF_Uint32, TF_Uint64]>:$maxval + ); + + let results = (outs + TensorOf<[I32, I64, TF_Uint32, TF_Uint64]>:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<3>; +} + +def TF_CloseSummaryWriterOp : TF_Op<"CloseSummaryWriter", []> { + let summary = "Flushes and closes the summary writer."; + + let description = [{ +Also removes it from the resource manager. To reopen, use another +CreateSummaryFileWriter op. + +writer: A handle to the summary writer resource. + }]; + + let arguments = (ins + Arg:$writer + ); + + let results = (outs); +} + +// TODO(b/168035831): Model db_uri read/write. +def TF_CreateSummaryDbWriterOp : TF_Op<"CreateSummaryDbWriter", []> { + let summary = "Creates summary database writer accessible by given resource handle."; + + let description = [{ +This can be used to write tensors from the execution graph directly +to a database. Only SQLite is supported right now. This function +will create the schema if it doesn't exist. Entries in the Users, +Experiments, and Runs tables will be created automatically if they +don't already exist. + +writer: Handle to SummaryWriter resource to overwrite. +db_uri: For example "file:/tmp/foo.sqlite". +experiment_name: Can't contain ASCII control characters or <>. Case + sensitive. If empty, then the Run will not be associated with any + Experiment. +run_name: Can't contain ASCII control characters or <>. Case sensitive. + If empty, then each Tag will not be associated with any Run. +user_name: Must be valid as both a DNS label and Linux username. If + empty, then the Experiment will not be associated with any User. 
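The naming constraints above (no ASCII control characters, no '<' or '>') are simple to check up front; a small illustrative validator, not part of the op's implementation:

```
#include <iostream>
#include <string>

// Returns true if the name satisfies the documented constraint:
// no ASCII control characters and no '<' or '>'.
bool IsValidSummaryName(const std::string &name) {
  for (unsigned char c : name) {
    if (c < 0x20 || c == 0x7f) return false;  // ASCII control characters.
    if (c == '<' || c == '>') return false;
  }
  return true;
}

int main() {
  std::cout << IsValidSummaryName("experiment_1") << "\n";       // 1
  std::cout << IsValidSummaryName("bad<name>") << "\n";          // 0
  std::cout << IsValidSummaryName(std::string("a\tb")) << "\n";  // 0 (tab)
}
```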
+ }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$db_uri, + TF_StrTensor:$experiment_name, + TF_StrTensor:$run_name, + TF_StrTensor:$user_name + ); + + let results = (outs); +} + +// TODO(b/168035831): Model logdir read/write. +def TF_CreateSummaryFileWriterOp : TF_Op<"CreateSummaryFileWriter", []> { + let summary = "Creates a summary file writer accessible by the given resource handle."; + + let description = [{ +writer: A handle to the summary writer resource +logdir: Directory where the event file will be written. +max_queue: Size of the queue of pending events and summaries. +flush_millis: How often, in milliseconds, to flush the pending events and + summaries to disk. +filename_suffix: Every event file's name is suffixed with this suffix. + }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$logdir, + I32Tensor:$max_queue, + I32Tensor:$flush_millis, + TF_StrTensor:$filename_suffix + ); + + let results = (outs); +} + +def TF_FlushSummaryWriterOp : TF_Op<"FlushSummaryWriter", []> { + let summary = "Flushes the writer's unwritten events."; + + let description = [{ +writer: A handle to the summary writer resource. + }]; + + let arguments = (ins + Arg:$writer + ); + + let results = (outs); +} + +def TF_ImportEventOp : TF_Op<"ImportEvent", []> { + let summary = "Outputs a `tf.Event` protocol buffer."; + + let description = [{ +When CreateSummaryDbWriter is being used, this op can be useful for +importing data from event logs. + +writer: A handle to a summary writer. +event: A string containing a binary-encoded tf.Event proto. + }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$event + ); + + let results = (outs); +} + +def TF_SummaryWriterOp : TF_Op<"SummaryWriter", []> { + let summary = "Returns a handle to be used to access a summary writer."; + + let description = [{ +The summary writer is an in-graph resource which can be used by ops to write +summaries to event files. + +writer: the summary writer resource. Scalar handle. + }]; + + let arguments = (ins + StrAttr:$shared_name, + StrAttr:$container + ); + + let results = (outs + Res:$writer + ); +} + +def TF_WriteAudioSummaryOp : TF_Op<"WriteAudioSummary", []> { + let summary = "Writes a `Summary` protocol buffer with audio."; + + let description = [{ +The summary has up to `max_outputs` summary values containing audio. The +audio is built from `tensor` which must be 3-D with shape `[batch_size, +frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +* If `max_outputs` is greater than 1, the summary value tags are + generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 2-D of shape `[batch_size, frames]`. +sample_rate: The sample rate of the signal in hertz. +max_outputs: Max number of batch elements to generate audio for. 
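The tag layout described above is mechanical: a single output uses '*tag*/audio', multiple outputs get a '/N' suffix. An illustrative sketch:

```
#include <iostream>
#include <string>
#include <vector>

// Builds summary value tags following the documented rule:
// one output -> "<tag>/audio", several -> "<tag>/audio/0", "<tag>/audio/1", ...
std::vector<std::string> AudioSummaryTags(const std::string &tag,
                                          int num_outputs) {
  std::vector<std::string> tags;
  for (int i = 0; i < num_outputs; ++i) {
    if (num_outputs == 1)
      tags.push_back(tag + "/audio");
    else
      tags.push_back(tag + "/audio/" + std::to_string(i));
  }
  return tags;
}

int main() {
  for (const auto &t : AudioSummaryTags("waveform", 3)) std::cout << t << "\n";
}
```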
+ }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tag, + F32Tensor:$tensor, + F32Tensor:$sample_rate, + + Confined, [IntMinValue<1>]>:$max_outputs + ); + + let results = (outs); +} + +def TF_WriteGraphSummaryOp : TF_Op<"WriteGraphSummary", []> { + let summary = "Writes a `GraphDef` protocol buffer to a `SummaryWriter`."; + + let description = [{ +writer: Handle of `SummaryWriter`. +step: The step to write the summary for. +tensor: A scalar string of the serialized tf.GraphDef proto. + }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tensor + ); + + let results = (outs); +} + +def TF_WriteHistogramSummaryOp : TF_Op<"WriteHistogramSummary", []> { + let summary = "Writes a histogram summary."; + + let description = [{ +The generated +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +has one summary value containing a histogram for `values`. + +This op reports an `InvalidArgument` error if any value is not finite. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Tag to use for the `Summary.Value`. +values: Any shape. Values to use to build the histogram. + }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tag, + TF_IntOrFpTensor:$values + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteImageSummaryOp : TF_Op<"WriteImageSummary", []> { + let summary = "Writes a `Summary` protocol buffer with images."; + + let description = [{ +The summary has up to `max_images` summary values containing images. The +images are built from `tensor` which must be 4-D with shape `[batch_size, +height, width, channels]` and where `channels` can be: + +* 1: `tensor` is interpreted as Grayscale. +* 3: `tensor` is interpreted as RGB. +* 4: `tensor` is interpreted as RGBA. + +The images have the same number of channels as the input tensor. For float +input, the values are normalized one image at a time to fit in the range +`[0, 255]`. `uint8` values are unchanged. The op uses two different +normalization algorithms: + +* If the input values are all positive, they are rescaled so the largest one + is 255. + +* If any input value is negative, the values are shifted so input value 0.0 + is at 127. They are then rescaled so that either the smallest value is 0, + or the largest one is 255. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_images` is 1, the summary value tag is '*tag*/image'. +* If `max_images` is greater than 1, the summary value tags are + generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. + +The `bad_color` argument is the color to use in the generated images for +non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +Each element must be in the range `[0, 255]` (It represents the value of a +pixel in the output image). Non-finite values in the input tensor are +replaced by this tensor in the output image. The default value is the color +red. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 4-D of shape `[batch_size, height, width, channels]` where + `channels` is 1, 3, or 4. +max_images: Max number of batch elements to generate images for. +bad_color: Color to use for pixels with non-finite values. 
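The float normalization spelled out above can be written directly: all-non-negative inputs are scaled so the maximum maps to 255; otherwise 0.0 is pinned at 127 and the data is scaled until either the minimum reaches 0 or the maximum reaches 255, whichever happens first. A standalone sketch of that rule for a single image of finite values (substituting `bad_color` for non-finite pixels is omitted):

```
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Normalizes one image of float values into [0, 255] following the rule in
// the description above. Assumes all values are finite.
std::vector<uint8_t> NormalizeImage(const std::vector<float> &pixels) {
  float lo = *std::min_element(pixels.begin(), pixels.end());
  float hi = *std::max_element(pixels.begin(), pixels.end());
  std::vector<uint8_t> out(pixels.size());
  if (lo >= 0.0f) {
    // All non-negative: rescale so the largest value becomes 255.
    float scale = hi > 0.0f ? 255.0f / hi : 0.0f;
    for (size_t i = 0; i < pixels.size(); ++i)
      out[i] = static_cast<uint8_t>(pixels[i] * scale);
  } else {
    // Some negative values: map 0.0 to 127, then scale so that either the
    // smallest value lands at 0 or the largest lands at 255.
    float scale = 127.0f / -lo;
    if (hi > 0.0f) scale = std::min(scale, 128.0f / hi);
    for (size_t i = 0; i < pixels.size(); ++i)
      out[i] = static_cast<uint8_t>(127.0f + pixels[i] * scale);
  }
  return out;
}

int main() {
  for (uint8_t v : NormalizeImage({-1.0f, 0.0f, 0.5f, 1.0f}))
    std::cout << int(v) << " ";  // 0 127 190 254 with the scale chosen above
  std::cout << "\n";
}
```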
+ }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tag, + TensorOf<[F16, F32, TF_Uint8]>:$tensor, + TF_Uint8Tensor:$bad_color, + + Confined, [IntMinValue<1>]>:$max_images + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteRawProtoSummaryOp : TF_Op<"WriteRawProtoSummary", []> { + let summary = "Writes a `Summary` protocol buffer with serialized string `Summary` protocol buffers."; + + let description = [{ +writer: A handle to a summary writer. +step: The step to write the summary for. +tensor: A tensor holding one or more serialized `Summary` protobufs to write. + }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tensor + ); + + let results = (outs); +} + +def TF_WriteScalarSummaryOp : TF_Op<"WriteScalarSummary", []> { + let summary = "Writes a `Summary` protocol buffer with scalar values."; + + let description = [{ +The input `tag` and `value` must have the scalars. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Tag for the summary. +value: Value for the summary. + }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_StrTensor:$tag, + TF_IntOrFpTensor:$value + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteSummaryOp : TF_Op<"WriteSummary", []> { + let summary = "Outputs a `Summary` protocol buffer with a tensor."; + + let description = [{ +writer: A handle to a summary writer. +step: The step to write the summary for. +tensor: A tensor to serialize. +tag: The summary's tag. +summary_metadata: Serialized SummaryMetadata protocol buffer containing + plugin-related metadata for this summary. + }]; + + let arguments = (ins + Arg:$writer, + I64Tensor:$step, + TF_Tensor:$tensor, + TF_StrTensor:$tag, + TF_StrTensor:$summary_metadata + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +// TODO(b/168035831): Model dataset read. +def TF_InitializeTableFromDatasetOp : TF_Op<"InitializeTableFromDataset", []> { + let summary = ""; + + let arguments = (ins + Arg:$table_handle, + TF_VariantTensor:$dataset + ); + + let results = (outs); +} + +// TODO(b/168035831): Model filename read. +def TF_InitializeTableFromTextFileV2Op : TF_Op<"InitializeTableFromTextFileV2", []> { + let summary = "Initializes a table from a text file."; + + let description = [{ +It inserts one key-value pair into the table for each line of the file. +The key and value is extracted from the whole line content, elements from the +split line based on `delimiter` or the line number (starting from zero). +Where to extract the key and value from a line is specified by `key_index` and +`value_index`. + +- A value of -1 means use the line number(starting from zero), expects `int64`. +- A value of -2 means use the whole line content, expects `string`. +- A value >= 0 means use the index (starting at zero) of the split line based + on `delimiter`. 
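The index convention above (-1: line number, -2: whole line, >= 0: n-th field after splitting on `delimiter`) can be sketched per line. Illustrative only; `vocab_size` checks and error handling are omitted:

```
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Extracts the key or value from one line of the vocabulary file following
// the documented convention: -1 -> line number, -2 -> whole line,
// >= 0 -> the index-th field after splitting on the delimiter.
std::string ExtractField(const std::string &line, int64_t line_number,
                         int index, char delimiter) {
  if (index == -1) return std::to_string(line_number);
  if (index == -2) return line;
  std::vector<std::string> fields;
  std::stringstream ss(line);
  std::string field;
  while (std::getline(ss, field, delimiter)) fields.push_back(field);
  return index < static_cast<int>(fields.size()) ? fields[index] : "";
}

int main() {
  std::string line = "emerson\t42";
  std::cout << ExtractField(line, 0, -2, '\t') << "\n";  // whole line
  std::cout << ExtractField(line, 0, 0, '\t') << "\n";   // "emerson"
  std::cout << ExtractField(line, 0, -1, '\t') << "\n";  // "0" (line number)
}
```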
+ }]; + + let arguments = (ins + Arg:$table_handle, + TF_StrTensor:$filename, + + Confined]>:$key_index, + Confined]>:$value_index, + Confined, [IntMinValue<-1>]>:$vocab_size, + DefaultValuedAttr:$delimiter + ); + + let results = (outs); +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index 5c19f9c3daa..953236602f9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ limitations under the License. #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -64,6 +66,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" @@ -173,6 +176,72 @@ static LogicalResult Verify(BatchMatMulV2Op op) { if (!HasRankAtLeast(op.y(), 2)) { return op.emitOpError("requires rhs operand to have rank at least two"); } + + RankedTensorType x_ty = GetRankedTensorTypeForOperand(op.x()); + RankedTensorType y_ty = GetRankedTensorTypeForOperand(op.y()); + + if (!x_ty || !y_ty) return success(); + + ArrayRef x_shape = x_ty.getShape(); + ArrayRef y_shape = y_ty.getShape(); + + // Check broadcast compatibility if both input shapes are known. + // + // The last two dimensions are non-batch dimensions that don't need to + // participate in batch dimension compatibility check. + + llvm::SmallVector result_batch_shape; + if (!OpTrait::util::getBroadcastedShape( + x_shape.drop_back(2), y_shape.drop_back(2), result_batch_shape)) + return op.emitOpError() + << "found incompatible broadcast batch dimensions for lhs shape " + << x_ty << " and rhs shape " << y_ty; + + RankedTensorType output_ty = GetRankedTensorTypeForOperand(op.output()); + if (!output_ty) return success(); + + int64_t expected_output_rank = std::max(x_ty.getRank(), y_ty.getRank()); + if (output_ty.getRank() != expected_output_rank) + return op.emitOpError() + << "found invalid output rank, expected " << expected_output_rank + << " but got " << output_ty.getRank(); + + // Check output batch dim with potential broadcasting. + ArrayRef output_shape = output_ty.getShape(); + for (int i = 0; i < result_batch_shape.size(); ++i) { + if (output_shape[i] != ShapedType::kDynamicSize && + output_shape[i] != result_batch_shape[i]) + return op.emitOpError() + << "has mismatching input batch dimension " + << result_batch_shape[i] << " and output batch dimension " + << output_shape[i]; + } + + // Check output shape for non-batch dimension, following documentation below. 
+ // https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-mat-mul + int64_t x_row_dim = x_shape[x_shape.size() - 2]; + int64_t x_col_dim = x_shape[x_shape.size() - 1]; + int64_t y_row_dim = y_shape[y_shape.size() - 2]; + int64_t y_col_dim = y_shape[y_shape.size() - 1]; + int64_t out_row_dim = output_shape[output_shape.size() - 2]; + int64_t out_col_dim = output_shape[output_shape.size() - 1]; + + int64_t expected_out_row_dim = op.adj_x() ? x_col_dim : x_row_dim; + int64_t expected_out_col_dim = op.adj_y() ? y_row_dim : y_col_dim; + + if (expected_out_row_dim != ShapedType::kDynamicSize && + out_row_dim != ShapedType::kDynamicSize && + out_row_dim != expected_out_row_dim) + return op.emitOpError() + << "found invalid output dimension on row, expected " + << expected_out_row_dim << " but got " << out_row_dim; + if (expected_out_col_dim != ShapedType::kDynamicSize && + out_col_dim != ShapedType::kDynamicSize && + out_col_dim != expected_out_col_dim) + return op.emitOpError() + << "found invalid output dimension on col, expected " + << expected_out_col_dim << " but got " << out_col_dim; + return success(); } @@ -187,7 +256,7 @@ void BatchMatMulV2Op::getCanonicalizationPatterns( static LogicalResult Verify(BatchToSpaceOp op) { // Op already has a constraint that block_size >= 2. - int64_t block_size = op.block_size().getSExtValue(); + int64_t block_size = op.block_size(); llvm::SmallVector input_shape(4, ShapedType::kDynamicSize); auto input_type = op.input().getType().cast(); @@ -339,15 +408,19 @@ void BatchToSpaceOp::getCanonicalizationPatterns( // are not unknown. // static LogicalResult Verify(BiasAddOp op) { - StringRef format = op.data_format(); - if (format == "NHWC") { + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); + tensorflow::TensorFormat format; + bool is_valid = FormatFromString(data_format, &format); + DCHECK(is_valid) << data_format; + if (format == tensorflow::TensorFormat::FORMAT_NHWC) { if (!HasRankAtLeast(op.value(), 2)) return op.emitOpError( "requires value operand to have rank at least two with `NHWC` data " "format"); } else { // Op definition requires data_format to be either NHWC or NCHW. - DCHECK_EQ(format.str(), "NCHW"); + DCHECK_EQ(format, tensorflow::TensorFormat::FORMAT_NCHW); if (!HasRankAtLeast(op.value(), 3)) return op.emitOpError( "requires value operand to have rank at least three with `NCHW` data " @@ -361,9 +434,8 @@ static LogicalResult Verify(BiasAddOp op) { RankedTensorType bias_ty = op.bias().getType().dyn_cast(); if (!bias_ty || !value_ty) return success(); - // TODO(hinsu): Leverage tensor_format.h utility in TensorFlow to compute - // dimension indices based on format. - int64_t feature_dim_idx = format == "NHWC" ? value_ty.getRank() - 1 : 1; + int64_t feature_dim_idx = + tensorflow::GetTensorFeatureDimIndex(value_ty.getRank(), format); int64_t feature_dim = value_ty.getDimSize(feature_dim_idx); int64_t bias_len = bias_ty.getDimSize(0); if (feature_dim != -1 && bias_len != -1 && feature_dim != bias_len) { @@ -375,6 +447,13 @@ static LogicalResult Verify(BiasAddOp op) { return success(); } +Optional BiasAddOp::GetContractionFusion() { + // Only NHWC in f32 is supported for fusion. 
+ if (data_format() != "NHWC" || !T().isF32()) return None; + + return ContractionFusion("BiasAdd", /*additional_arguments=*/{1}); +} + //===----------------------------------------------------------------------===// // BiasAddGradOp //===----------------------------------------------------------------------===// @@ -383,15 +462,19 @@ static LogicalResult Verify(BiasAddOp op) { // * the out_backprop operands have valid ranks or are unranked. // static LogicalResult Verify(BiasAddGradOp op) { - StringRef format = op.data_format(); - if (format == "NHWC") { + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); + tensorflow::TensorFormat format; + bool is_valid = FormatFromString(data_format, &format); + DCHECK(is_valid) << data_format; + if (format == tensorflow::TensorFormat::FORMAT_NHWC) { if (!HasRankAtLeast(op.out_backprop(), 2)) return op.emitOpError( "requires out_backprop operand to have rank at least two with `NHWC` " "data format"); } else { // Op definition requires data_format to be either NHWC or NCHW. - DCHECK_EQ(format.str(), "NCHW"); + DCHECK_EQ(format, tensorflow::TensorFormat::FORMAT_NCHW); if (!HasRankAtLeast(op.out_backprop(), 3)) return op.emitOpError( "requires out_backprop operand to have rank at least three with " @@ -431,6 +514,19 @@ static LogicalResult Verify(BroadcastToOp op) { return success(); } +OpFoldResult BroadcastToOp::fold(ArrayRef operands) { + Value input = this->input(); + + // Fold broadcast if operand and result types are the same and all dimensions + // are statically known (no-op broadcast). + auto result_ty = getType().dyn_cast(); + if (result_ty && result_ty.hasStaticShape() && result_ty == input.getType()) { + return input; + } + + return {}; +} + //===----------------------------------------------------------------------===// // CaseOp //===----------------------------------------------------------------------===// @@ -449,28 +545,119 @@ LogicalResult FoldConstantCaseOp::matchAndRewrite( DenseIntElementsAttr branch; if (!matchPattern(op.branch_index(), m_Constant(&branch))) return failure(); - // Only attempt to fold scalar valued case statements. - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. - if (!branch.getType().cast().getShape().empty()) - return failure(); - int index = *branch.getValues().begin(); - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. 
- if (index >= op.branches().size()) return failure(); + if (index < 0 || index >= op.branches().size()) + index = op.branches().size() - 1; auto func = op.branches()[index].cast(); auto empty = rewriter.getStringAttr(""); auto call_op = rewriter.create( op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); - PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + CopyDeviceAndUnderscoredAttributes(op.getOperation(), call_op); rewriter.replaceOp(op, call_op.getResults()); return success(); } void CaseOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); + results.insert>(context); +} + +static LogicalResult VerifyCaseOpBase(Operation *op, Value branch_index) { + if (!IsOfRankOrUnranked(branch_index, 0)) + return op->emitOpError() + << "expects 'branch_index' to be a scalar, but got " + << branch_index.getType(); + return success(); +} + +static LogicalResult VerifyCaseOrIfOpBranchFunctions( + Operation *op, ArrayRef branches, + llvm::function_ref branch_name) { + SmallVector branch_types; + branch_types.reserve(branches.size()); + + // Functions have one less operand compared to op as first operand is elided + // (`cond` of `tf.If` and `branch_index` of `tf.Case`). + TypeRangeWithDesc input{op->getOperands().drop_front().getTypes(), "input"}; + TypeRangeWithDesc result{op->getResultTypes(), "result"}; + + for (auto branch : llvm::enumerate(branches)) { + auto branch_func = SymbolTable::lookupNearestSymbolFrom( + op, branch.value().cast()); + if (!branch_func) + return op->emitOpError() + << "expects " << branch_name(branch.index()) << " (" + << branch.value() << ") to point to a defined function"; + + FunctionType branch_type = branch_func.getType(); + std::string desc = branch_name(branch.index()) + " input"; + TypeRangeWithDesc branch_input{branch_type.getInputs(), desc}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_input, input))) + return failure(); + + desc = branch_name(branch.index()) + " result"; + TypeRangeWithDesc branch_result{branch_type.getResults(), desc}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_result, result))) + return failure(); + + branch_types.push_back(branch_type); + } + + // If branches have incompatible input types that means that no tensor can + // serve as input to all the functions. Hence, the op is invalid. 
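The per-index loop that follows asks whether some tensor type could feed every branch at that operand position. `AreCastCompatible` in the real code understands ref types and dynamic shapes; this standalone sketch only captures the shape of the check, with strings standing in for types and "?" meaning an unranked/unknown type:

```
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for AreCastCompatible: "?" is compatible with anything;
// otherwise the concrete types must agree.
bool AreCompatible(const std::vector<std::string> &types) {
  std::string concrete;
  for (const std::string &t : types) {
    if (t == "?") continue;
    if (concrete.empty())
      concrete = t;
    else if (t != concrete)
      return false;
  }
  return true;
}

int main() {
  // Input type of each branch function at each operand index.
  std::vector<std::vector<std::string>> branch_inputs = {
      {"tensor<2xf32>", "tensor<2xf32>"},   // branch #0
      {"?",             "tensor<2xf32>"},   // branch #1
      {"tensor<2xf32>", "tensor<3xf32>"}};  // branch #2
  int num_inputs = 2;
  for (int i = 0; i < num_inputs; ++i) {
    std::vector<std::string> types_at_i;
    for (const auto &branch : branch_inputs) types_at_i.push_back(branch[i]);
    std::cout << "operand " << i
              << (AreCompatible(types_at_i) ? ": compatible" : ": incompatible")
              << "\n";  // operand 0: compatible, operand 1: incompatible
  }
}
```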
+ int expected_num_inputs = op->getNumOperands() - 1; + for (int i = 0; i < expected_num_inputs; ++i) { + SmallVector branch_input_i_types; + branch_input_i_types.reserve(branches.size()); + llvm::transform( + branch_types, std::back_inserter(branch_input_i_types), + [i](FunctionType &branch_type) { return branch_type.getInput(i); }); + if (!AreCastCompatible(branch_input_i_types)) { + std::string input_types_str; + llvm::raw_string_ostream os(input_types_str); + llvm::interleaveComma(branch_input_i_types, os); + return op->emitOpError() + << "expects all branch input type(s) (" << os.str() + << ") at index " << i << " to be cast compatible"; + } + } + + return success(); +} + +static LogicalResult Verify(CaseOp op) { + if (failed(VerifyCaseOpBase(op, op.branch_index()))) return failure(); + auto branch_name = [](unsigned index) { + return llvm::formatv("branch #{0}", index).str(); + }; + return VerifyCaseOrIfOpBranchFunctions(op, op.branches().getValue(), + branch_name); +} + +//===----------------------------------------------------------------------===// +// CaseRegionOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(CaseRegionOp op) { + if (op.branches().empty()) + return op.emitOpError() << "expects to have at least 1 region"; + + if (failed(VerifyCaseOpBase(op, op.branch_index()))) return failure(); + + TypeRangeWithDesc results{op.getResultTypes(), "result"}; + + for (auto region_and_idx : llvm::enumerate(op.branches())) { + std::string description = + llvm::formatv("branch #{0} result", region_and_idx.index()).str(); + Operation *yield = region_and_idx.value().front().getTerminator(); + TypeRangeWithDesc branch_results{yield->getOperandTypes(), description}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_results, results))) + return failure(); + } + + return success(); } //===----------------------------------------------------------------------===// @@ -727,6 +914,35 @@ void ConcatV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, context); } +//===----------------------------------------------------------------------===// +// CumsumOp and CumprodOp +//===----------------------------------------------------------------------===// + +template ::value>::type * = nullptr> +static LogicalResult Verify(OpT op) { + if (!IsOfRankOrUnranked(op.axis(), 0)) + return op.emitOpError("requires scalar axis operand"); + + DenseIntElementsAttr axis_attr; + if (matchPattern(op.axis(), m_Constant(&axis_attr))) { + auto input_ty = op.x().getType().template dyn_cast(); + if (input_ty) { + int64_t rank = input_ty.getRank(); + assert(axis_attr.getNumElements() == 1 && + "scalar attribute should have exactly one element"); + int64_t axis = (*axis_attr.begin()).getSExtValue(); + if (axis < -rank || axis >= rank) { + return op.emitError() + << "axis operand should be within range [" << -rank << ", " + << rank << "); actual value: " << axis; + } + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // ConcatOffsetOp //===----------------------------------------------------------------------===// @@ -990,7 +1206,8 @@ static LogicalResult Verify(OpT op) { int64_t input_channels = -1; if (auto ty = op.input().getType().template dyn_cast()) { - std::string data_format = op.data_format().str(); + absl::string_view data_format(op.data_format().data(), + op.data_format().size()); tensorflow::TensorFormat format; auto is_valid = FormatFromString(data_format, 
&format); DCHECK(is_valid) << data_format; @@ -1475,7 +1692,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { return op.emitOpError("range is invalid: [" + Twine(std::to_string(rmin)) + "," + Twine(std::to_string(rmax)) + "]"); } - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1495,7 +1712,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { if (max && !IsOfRankedFloatTensorType(max, 0)) return op.emitOpError("requires max to be a 0d float tensor"); - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1519,7 +1736,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { if (!HasRankAtLeast(inputs, 1)) return op.emitError("requires inputs to be at least 1d float tensor"); - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1722,7 +1939,7 @@ StringRef FusedBatchNormV3Op::GetOptimalLayout(const RuntimeDevices &devices) { //===----------------------------------------------------------------------===// static LogicalResult Verify(GatherV2Op op) { - int64_t batch_dims = op.batch_dims().getSExtValue(); + int64_t batch_dims = op.batch_dims(); if (auto ty = op.indices().getType().dyn_cast()) { int64_t rank = ty.getRank(); if (batch_dims > rank || batch_dims < -rank) @@ -1760,79 +1977,18 @@ static LogicalResult Verify(GatherV2Op op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(IfOp op) { - auto then_fn = op.then_func(); - if (!then_fn) - return op.emitOpError("then_branch refers to an undefined function : ") - << op.then_branch(); - auto else_fn = op.else_func(); - if (!else_fn) - return op.emitOpError("else_branch refers to an undefined function : ") - << op.else_branch(); - auto then_fn_type = then_fn.getType(); - auto else_fn_type = else_fn.getType(); - - // Non-conditional operands starting with the second operand are passed to - // branches and should be pair-wise compatible with branches' inputs. - unsigned expected_num_inputs = op.getNumOperands() - 1; - if (then_fn_type.getNumInputs() != expected_num_inputs || - else_fn_type.getNumInputs() != expected_num_inputs) - return op.emitError("branches should have " + Twine(expected_num_inputs) + - " inputs"); - - for (unsigned i = 0; i < expected_num_inputs; ++i) { - auto operand_type = op.getOperand(i + 1).getType().cast(); - auto then_input_type = then_fn_type.getInput(i).cast(); - if (!AreCastCompatible({operand_type, then_input_type})) - return op.emitError( - llvm::formatv("then branch input type {0} is incompatible with " - "operand type {1} at index {2}", - then_input_type, operand_type, i)); - - auto else_input_type = else_fn_type.getInput(i).cast(); - if (!AreCastCompatible({operand_type, else_input_type})) - return op.emitError( - llvm::formatv("else branch input type {0} is incompatible with " - "operand type {1} at index {2}", - else_input_type, operand_type, i)); - - // If branches have incompatible input types that means that no tensor can - // serve as input to both the functions. Hence, the op is invalid. 
- if (!AreCastCompatible({then_input_type, else_input_type})) - return op.emitError(llvm::formatv( - "branches inputs have incompatible types {0} and {1} at index {2}", - then_input_type, else_input_type, i)); - } - - // Branches' results should be pair-wise compatible with the op results. - unsigned expected_num_results = op.getNumResults(); - if (then_fn_type.getNumResults() != expected_num_results || - else_fn_type.getNumResults() != expected_num_results) - return op.emitError("branches should have " + Twine(expected_num_results) + - " results"); - - for (unsigned i = 0; i < expected_num_results; ++i) { - auto result_type = op.getResult(i).getType().cast(); - auto then_result_type = then_fn_type.getResult(i).cast(); - if (!AreCastCompatible({then_result_type, result_type})) - return op.emitError( - llvm::formatv("then branch result type {0} is incompatible with op " - "result type {1} at index {2}", - then_result_type, result_type, i)); - - auto else_result_type = else_fn_type.getResult(i).cast(); - if (!AreCastCompatible({else_result_type, result_type})) - return op.emitError( - llvm::formatv("else branch result type {0} is incompatible with op " - "result type {1} at index {2}", - else_result_type, result_type, i)); - } - return success(); + auto branch_name = [](unsigned index) -> std::string { + return index == 0 ? "'then_branch'" : "'else_branch'"; + }; + return VerifyCaseOrIfOpBranchFunctions( + op, {op.then_branchAttr(), op.else_branchAttr()}, branch_name); } //===----------------------------------------------------------------------===// // IfOp canonicalization. //===----------------------------------------------------------------------===// +namespace { class FoldConstantIfOp : public OpRewritePattern { public: explicit FoldConstantIfOp(MLIRContext *context) @@ -1864,9 +2020,9 @@ LogicalResult FoldConstantIfOp::matchAndRewrite( auto rewrite = [&](auto op_type) { auto empty = rewriter.getStringAttr(""); auto call_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, + op.getLoc(), op.getResultTypes(), op.input(), func, /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); - PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + CopyDeviceAndUnderscoredAttributes(op.getOperation(), call_op); rewriter.replaceOp(op, call_op.getResults()); }; @@ -1877,6 +2033,7 @@ LogicalResult FoldConstantIfOp::matchAndRewrite( return success(); } +} // anonymous namespace void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { @@ -1888,13 +2045,77 @@ void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results, //===----------------------------------------------------------------------===// static LogicalResult Verify(IfRegionOp op) { - if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) + TypeRange then_types = + op.then_branch().front().getTerminator()->getOperandTypes(); + TypeRange else_types = + op.else_branch().front().getTerminator()->getOperandTypes(); + + TypeRangeWithDesc results{op.getResultTypes(), "result"}; + TypeRangeWithDesc then_results{then_types, "then result"}; + TypeRangeWithDesc else_results{else_types, "else result"}; + + if (failed(VerifyTypeRangesAreCompatible(op, then_results, results))) return failure(); - if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) + if (failed(VerifyTypeRangesAreCompatible(op, else_results, results))) return failure(); return success(); } +namespace { +class FoldConstantIfRegionOp : public OpRewritePattern 
{ + public: + explicit FoldConstantIfRegionOp(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::IfRegionOp op, + PatternRewriter &rewriter) const override; +}; + +LogicalResult FoldConstantIfRegionOp::matchAndRewrite( + TF::IfRegionOp op, PatternRewriter &rewriter) const { + // Extract the constant cond value. + DenseIntElementsAttr cond_attr; + if (!matchPattern(op.cond(), m_Constant(&cond_attr))) return failure(); + + // IfRegion condition should always be a scalar. Select the region to fold to. + bool cond = cond_attr.getSplatValue().getValue(); + Region ®ion = cond ? op.then_branch() : op.else_branch(); + + // If the IfRegion is stateless but the region being inlined itself is not + // stateless, then inlining the region could cause a loss of information. + // However, its probably better to fold the IfRegion instead of having the + // dead branch stay. + + // Inline the region in place of the IfRegion op, and forward the yield + // inputs to the IfRegion op results. This is possible only if the yield + // types match the result types. + auto yield = cast(region.front().getTerminator()); + auto updated_results = llvm::to_vector<4>(yield.getOperands()); + + // If the yield types do not match the IfRegion result types, add appropriate + // casts. + rewriter.setInsertionPoint(yield); + for (auto it : llvm::zip(op.getResultTypes(), updated_results)) { + auto &updated_result = std::get<1>(it); + Type result_type = std::get<0>(it); + if (result_type != updated_result.getType()) { + updated_result = + rewriter.create(op.getLoc(), result_type, updated_result, + /*Truncate=*/rewriter.getBoolAttr(false)); + } + } + // Inline the region into the block containing the IfRegion. + rewriter.mergeBlockBefore(®ion.front(), op); + rewriter.eraseOp(yield); + rewriter.replaceOp(op, updated_results); + return success(); +} +} // anonymous namespace + +void IfRegionOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // InvertOp //===----------------------------------------------------------------------===// @@ -1943,6 +2164,15 @@ OpFoldResult LeakyReluOp::fold(ArrayRef operands) { return {}; } +Optional LeakyReluOp::GetContractionFusion() { + // Only f32 is supported for fusion. 
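The two constant-condition canonicalizations above (FoldConstantIfOp and FoldConstantIfRegionOp) share the same gating check: the predicate must be a splat boolean constant, and that single value selects which branch survives. A minimal sketch of that check for the region form, with the template arguments of the accessors written out as assumptions (for example getSplatValue<BoolAttr>):

DenseIntElementsAttr cond_attr;
if (!matchPattern(op.cond(), m_Constant(&cond_attr))) return failure();
// A scalar (splat) predicate picks exactly one region to keep.
bool condition = cond_attr.getSplatValue<BoolAttr>().getValue();
Region &region = condition ? op.then_branch() : op.else_branch();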
+ if (!T().isF32()) return None; + + NamedAttribute alpha(Identifier::get("alpha", getContext()), alphaAttr()); + return ContractionFusion("LeakyRelu", /*additional_arguments=*/{}, + /*additional_attributes=*/{alpha}); +} + //===----------------------------------------------------------------------===// // LogOp //===----------------------------------------------------------------------===// @@ -2064,12 +2294,12 @@ OpFoldResult MulOp::fold(ArrayRef operands) { return IdentityArithmeticOpFolder(*this, operands); } +} // namespace TF +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h index 19a927a23d7..8d98632b198 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h @@ -43,6 +43,9 @@ namespace TF { class YieldOp; +} // namespace TF +} // namespace mlir + // TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose // purpose is to catch bug on `tensorflow::mutex_lock`. We don't use // `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and @@ -56,7 +59,4 @@ class YieldOp; #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc index 71f1560aa6c..44df2b12d88 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc @@ -18,17 +18,6 @@ limitations under the License. // tf_verifiers or tf_ops. // TODO(jpienaar): Remove this file post refactoring. -// Propagates underscore and device attributes from src to dst. -// TODO(b/158769932): This should be a general feature instead post some policy -// discussion. -static void PropagateDeviceAndInternalAttrs(Operation *src, Operation *dst) { - auto device = mlir::Identifier::get("device", src->getContext()); - for (auto named_attr : src->getAttrs()) { - if (*named_attr.first.begin() == '_' || named_attr.first == device) - dst->setAttr(named_attr.first, named_attr.second); - } -} - //===----------------------------------------------------------------------===// // TF op helper functions //===----------------------------------------------------------------------===// @@ -554,27 +543,27 @@ static LogicalResult VerifyReductionInputAndDims(Value input, Value dims, return success(); } -LogicalResult VerifyRegionResults(Operation *op, Region ®ion, - StringRef region_name) { - auto op_name = op->getName().getStringRef(); - // verify that op outputs match yield inputs - YieldOp yield = cast(region.front().getTerminator()); - unsigned expected_num_results = op->getNumResults(); - if (yield.getNumOperands() != expected_num_results) - return op->emitOpError() - << region_name + " should have same number (" << expected_num_results - << ") of results as " << op_name << " but has " - << yield.getNumOperands() << " results"; +// A type range with description (in singular form) attached to it. 
+using TypeRangeWithDesc = std::pair; - for (int idx : llvm::seq(0, expected_num_results)) { - auto op_result_type = op->getResult(idx).getType().cast(); - auto region_result_type = - yield.getOperand(idx).getType().cast(); - if (!AreCastCompatible({region_result_type, op_result_type})) - return op->emitError(llvm::formatv( - "{0} result type {1} is incompatible with {2} " - "result type {3} at index {4}", - region_name, region_result_type, op_name, op_result_type, idx)); +LogicalResult VerifyTypeRangesAreCompatible(Operation *op, + TypeRangeWithDesc range0, + TypeRangeWithDesc range1) { + if (range0.first.size() != range1.first.size()) { + return op->emitOpError() + << range0.second << "s (size = " << range0.first.size() << ")" + << " should have the same number of values as " << range1.second + << "s (size = " << range1.first.size() << ")"; + } + + for (auto it : llvm::enumerate(llvm::zip(range0.first, range1.first))) { + int index = it.index(); + Type type0 = std::get<0>(it.value()); + Type type1 = std::get<1>(it.value()); + if (!AreCastCompatible({type0, type1})) + return op->emitOpError(llvm::formatv( + "{0} type {1} is incompatible with {2} type {3} at index {4}", + range0.second, type0, range1.second, type1, index)); } return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index ffedcb47f7e..c2f39733c7a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -109,7 +109,7 @@ void NotEqualOp::build(OpBuilder &builder, OperationState &result, Value x, //===----------------------------------------------------------------------===// static LogicalResult Verify(OneHotOp op) { - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); auto indices_ty = op.indices().getType().dyn_cast(); if (indices_ty && @@ -207,7 +207,7 @@ static LogicalResult Verify(PackOp op) { // the axis value range is [-(R+1), R+1). int64_t range_begin = -inputs_rank - 1; // Inclusive int64_t range_end = inputs_rank + 1; // Exclusive - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < range_begin || axis >= range_end) { return op.emitError() << "attribute 'axis' should be within range [" << range_begin << ", " << range_end @@ -232,7 +232,7 @@ OpFoldResult PackOp::fold(ArrayRef operands) { if (values().size() < 2) return {}; // Dimensions packed along axis = 0 (pack scalars into vector). - if (axis().getSExtValue() != 0) return {}; + if (axis() != 0) return {}; // First packed value is defined by a strided slice operation. auto slice_op = dyn_cast_or_null(values()[0].getDefiningOp()); @@ -247,11 +247,9 @@ OpFoldResult PackOp::fold(ArrayRef operands) { // All masks are `0` except `shrink_axis_mask` which is equal to `1` (slicing // scalar value from input vector). 
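Many of the simplifications in this file follow from a single accessor change: integer attributes that previously surfaced as APInt are now returned as plain integers by the generated op classes, so the getSExtValue()/getZExtValue() calls disappear. A one-line sketch using the OneHot axis attribute:

int64_t axis = op.axis();                    // accessor now yields int64_t
// int64_t axis = op.axis().getSExtValue();  // previous APInt-based form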
- if (slice_op.begin_mask().getSExtValue() != 0 || - slice_op.ellipsis_mask().getSExtValue() != 0 || - slice_op.end_mask().getSExtValue() != 0 || - slice_op.new_axis_mask().getSExtValue() != 0 || - slice_op.shrink_axis_mask().getSExtValue() != 1) + if (slice_op.begin_mask() != 0 || slice_op.ellipsis_mask() != 0 || + slice_op.end_mask() != 0 || slice_op.new_axis_mask() != 0 || + slice_op.shrink_axis_mask() != 1) return {}; // Returns a value if the `value` is defined by a ConstOp with a single @@ -566,6 +564,17 @@ OpFoldResult RealDivOp::fold(ArrayRef operands) { return IdentityArithmeticOpFolder(*this, operands); } +//===----------------------------------------------------------------------===// +// ReluOp +//===----------------------------------------------------------------------===// + +Optional ReluOp::GetContractionFusion() { + // Only f32 is supported for fusion. + if (!T().isF32()) return None; + + return ContractionFusion("Relu", /*additional_arguments=*/{}); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -707,7 +716,6 @@ OpFoldResult ReshapeOp::fold(ArrayRef operands) { // Fold reshape if operand and result types are the same and all dimensions // are statically known (no-op reshape). - // TODO(ezhulenev): Add the same folding for BroadcastToOp. auto result_ty = getType().dyn_cast(); if (result_ty && result_ty.hasStaticShape() && result_ty == tensor.getType()) { @@ -932,24 +940,75 @@ static LogicalResult Verify(ShapeNOp op) { return success(); } -LogicalResult ShapeNOp::fold(ArrayRef operands, - SmallVectorImpl &results) { - if (getNumOperands() == 0) return success(); - int width = - getType(0).cast().getElementType().getIntOrFloatBitWidth(); - - for (Type input_ty : getOperandTypes()) { - OpFoldResult result = ConvertShapeToAttr(input_ty, width); - if (!result) return failure(); - - results.push_back(result); - } - return success(); -} - -// TODO(hinsu): Add canonicalization pattern for ShapeN ops that don't have all +namespace { +// Canonicalization pattern for ShapeNOp that don't have all // static input shapes. Replacing output values corresponding to static input // types may enable optimizations in users of the values. +class ShapeNPartialStaticInputShape : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ShapeNOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() == 0) { + rewriter.eraseOp(op); + return success(); + } + + int width = getElementTypeOrSelf(op.getType(0)).getIntOrFloatBitWidth(); + + SmallVector results(op.getNumOperands()); + SmallVector dynamic_indices; + SmallVector dynamic_inputs; + SmallVector result_types; + for (auto e : llvm::enumerate(op.getOperands())) { + if (Attribute result = ConvertShapeToAttr(e.value().getType(), width)) { + results[e.index()] = rewriter.create(op.getLoc(), result); + } else { + dynamic_indices.push_back(e.index()); + dynamic_inputs.push_back(e.value()); + result_types.push_back(op.getType(e.index())); + } + } + + if (dynamic_inputs.size() == op.getNumOperands()) { + // Cannot canonicalize ShapeN if all inputs are dynamic. + return failure(); + } + + // Create a ShapeNOp for all dynamic inputs. 
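This pattern and the single-operand ShapeNToShape pattern defined further below are assumed to be registered together in ShapeNOp::getCanonicalizationPatterns, roughly as sketched here, so a one-input ShapeN collapses to Shape while mixed static/dynamic inputs split into constants plus a smaller ShapeN:

void ShapeNOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
                                           MLIRContext *context) {
  // Assumed registration list and order.
  results.insert<ShapeNToShape, ShapeNPartialStaticInputShape>(context);
}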
+ if (!dynamic_inputs.empty()) { + auto dynamic_shape_n = rewriter.create( + op.getLoc(), result_types, dynamic_inputs); + for (auto index_result : + llvm::zip(dynamic_indices, dynamic_shape_n.getResults())) { + results[std::get<0>(index_result)] = std::get<1>(index_result); + } + } + + rewriter.replaceOp(op, results); + return success(); + } +}; + +// Canonicalize ShapeNOp to ShapeOp if there is only one operand. +class ShapeNToShape : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ShapeNOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() != 1) { + return failure(); + } + auto shape = rewriter.create(op.getLoc(), op.getType(0), + op.getOperand(0)); + rewriter.replaceOp(op, {shape}); + return success(); + } +}; +} // namespace + +void ShapeNOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} //===----------------------------------------------------------------------===// // SizeOp @@ -964,9 +1023,23 @@ static LogicalResult Verify(SizeOp op) { return op.emitOpError( "requires ranked input tensor to be of rank INT32_MAX or less"); + // Output type needs to be scalar. + if (!IsOfRankOrUnranked(op.output(), /*rank=*/0)) + return op.emitOpError("requires scalar output"); + return success(); } +OpFoldResult SizeOp::fold(ArrayRef operands) { + ShapedType output_type = getType().cast(); + ShapedType input_type = getOperand().getType().cast(); + if (!input_type.hasStaticShape()) return {}; + int size = input_type.getNumElements(); + return DenseElementsAttr::get( + output_type, + IntegerAttr::get(output_type.getElementType(), /*value=*/size)); +} + //===----------------------------------------------------------------------===// // SliceOp //===----------------------------------------------------------------------===// @@ -978,8 +1051,11 @@ static LogicalResult Verify(SizeOp op) { // of elements in operands begin and size. // - if begin are constants, that // 0 <= begin[i] <= begin[i] + size[i] <= input_ty.getShape()[i] +// and +// size[i] == output_ty.getShape()[i] // - if begins aren't constant but the input is a ranked tensor, that // size[i] <= input_ty.getShape()[i] +// - output rank is the same as input rank // static LogicalResult Verify(SliceOp op) { RankedTensorType begin_ty = GetRankedTensorTypeForOperand(op.begin()); @@ -1007,21 +1083,40 @@ static LogicalResult Verify(SliceOp op) { "are equal to input rank"; } + auto output_ty = op.output().getType().dyn_cast(); + if (output_ty && input_ty && output_ty.getRank() != input_ty.getRank()) { + return op.emitOpError() + << "requires output to have the same rank as input, but got input " + "rank " + << input_ty.getRank() << " and output rank " << output_ty.getRank(); + } + DenseIntElementsAttr begin_indices; if (matchPattern(op.begin(), m_Constant(&begin_indices))) { DenseIntElementsAttr slice_sizes; bool constant_slice_sizes = matchPattern(op.size(), m_Constant(&slice_sizes)); int dim = 0; + // TODO(jpienaar): Reformulate the shape verification below to not use magic + // constants. for (const APInt &raw_begin_index : begin_indices.getValues()) { int64_t begin_index = raw_begin_index.getSExtValue(); int64_t input_size = input_ty ? input_ty.getShape()[dim] : -1; int64_t slice_size = constant_slice_sizes ? slice_sizes.getValue(dim).getSExtValue() : 0; + int64_t output_size = output_ty ? 
output_ty.getShape()[dim] : -1; + if (slice_size == -1 && input_size != -1) { slice_size = input_size - begin_index; } + if (output_size != -1 && constant_slice_sizes && + output_size != slice_size) { + return op.emitOpError() + << "requires output size to have the same size of slice, got " + "slice size " + << slice_size << " and output size " << output_size; + } if (begin_index < 0 || (input_size != -1 && begin_index + slice_size > input_size)) { return op.emitOpError() @@ -1079,6 +1174,13 @@ static LogicalResult Verify(SoftmaxCrossEntropyWithLogitsOp op) { return success(); } +//===----------------------------------------------------------------------===// +// SpaceToBatchNDOp +//===----------------------------------------------------------------------===// + +// TODO(b/157475606): Add Verify(SpaceToBatchNDOp) +// TODO(b/157475606): Add SpaceToBatchNDOp::inferReturnTypes + //===----------------------------------------------------------------------===// // SparseSoftmaxCrossEntropyWithLogitsOp //===----------------------------------------------------------------------===// @@ -1325,7 +1427,7 @@ static LogicalResult VerifyStridedSliceBase(OpTy op) { // Use bit compares to ensure ellipsis_mask is 0 or a power of 2, i.e. there // exists only no more than one ellipsis. - uint32_t ellipsis_mask = op.ellipsis_mask().getZExtValue(); + uint32_t ellipsis_mask = op.ellipsis_mask(); if (ellipsis_mask != 0 && !llvm::isPowerOf2_32(ellipsis_mask)) return op.emitOpError("cannot have multiple ellipses"); @@ -1581,10 +1683,9 @@ bool StridedSliceOp::GetSlicedBoundRanges( sparse_strides.push_back(stride.getSExtValue()); CalculateSlicedShapeFromSparseIndices( - input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + input_shape, sparse_begin, sparse_end, sparse_strides, begin_mask(), + end_mask(), ellipsis_mask(), new_axis_mask(), shrink_axis_mask(), + slice_begin, slice_end, slice_stride); return true; } @@ -1635,10 +1736,9 @@ bool StridedSliceGradOp::GetSlicedShapeAndBoundRanges( sparse_strides.push_back(stride.getSExtValue()); CalculateSlicedShapeFromSparseIndices( - *input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + *input_shape, sparse_begin, sparse_end, sparse_strides, begin_mask(), + end_mask(), ellipsis_mask(), new_axis_mask(), shrink_axis_mask(), + slice_begin, slice_end, slice_stride); return true; } @@ -1712,6 +1812,87 @@ static LogicalResult Verify(TensorScatterUpdateOp op) { return success(); } +//===----------------------------------------------------------------------===// +// TileOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// - input has at least rank 1 +// - multiples is rank 1 +// - multiples.size() == input.rank() +// - input.rank() == output.rank() +// - Elements in multiples are non-negative +// - input.shape[i] * multiples[i] == output.shape[i] +// for i in [0, input.rank() - 1] + +static LogicalResult Verify(TileOp op) { + auto input_type = op.input().getType().dyn_cast(); + auto multiples_type = op.multiples().getType().dyn_cast(); + auto output_type = op.output().getType().dyn_cast(); + + if (multiples_type && 
multiples_type.getRank() != 1) { + return op.emitOpError() << "expected multiples to be rank 1, got rank = " + << multiples_type.getRank(); + } + + if (input_type && multiples_type && multiples_type.hasStaticShape() && + (input_type.getRank() != multiples_type.getNumElements() || + (input_type.getRank() == 0 && multiples_type.getNumElements() == 1))) { + return op.emitOpError() + << "expected size of multiples equal to rank of input" + << ", got multiples of size " << multiples_type.getNumElements() + << ", and input of rank " << input_type.getRank(); + } + + if (input_type && output_type) { + if (input_type.getRank() != output_type.getRank()) { + return op.emitOpError() + << "expected rank of input to equal to rank of output" + << ", got input of rank " << input_type.getRank() + << ", and output of rank " << output_type.getRank(); + } + + DenseIntElementsAttr multiples_attr; + if (matchPattern(op.multiples(), m_Constant(&multiples_attr))) { + for (int32_t i = 0, e = input_type.getRank(); i < e; ++i) { + const int64_t input_dim = input_type.getDimSize(i); + const int64_t output_dim = output_type.getDimSize(i); + const int64_t m = multiples_attr.getValue(i).getSExtValue(); + + if (m < 0) { + return op.emitOpError() + << "expected multiples to be non-negative, got " + << "multiples[" << i << "] = " << m; + } + + if (!ShapedType::isDynamic(input_dim) && + !ShapedType::isDynamic(output_dim) && output_dim != input_dim * m) { + return op.emitOpError() + << "requires input.shape[" << i << "] (" << input_dim << ")" + << " * " << m << " to be equal to " + << "output.shape[" << i << "] (" << output_dim << ")"; + } + } + } + } + + return success(); +} + +OpFoldResult TileOp::fold(ArrayRef operands) { + DenseIntElementsAttr multiples_attr; + if (matchPattern(multiples(), m_Constant(&multiples_attr))) { + // Return input directly when multiples are all ones, + // regardless what input is. + if (multiples_attr.isSplat() && + multiples_attr.getSplatValue().getSExtValue() == 1) { + return input(); + } + } + return {}; +} + //===----------------------------------------------------------------------===// // TopKV2Op //===----------------------------------------------------------------------===// @@ -1732,26 +1913,57 @@ static LogicalResult Verify(TopKV2Op op) { //===----------------------------------------------------------------------===// namespace { -// If the input to ToBoolOp is a `tensor`, then the ToBoolOp is an identity -// function and can be removed. -class ToBoolOfZeroDBoolTensor : public OpRewritePattern { +// If the input to ToBoolOp is a ranked tensor, then the ToBoolOp can be folded +// into an identity or an equality comparison. +class ToBoolOfRankedTensor : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ToBoolOp op, PatternRewriter &rewriter) const override { - if (auto type = op.getOperand().getType().dyn_cast()) { - if (type.getRank() == 0 && type.getElementType().isInteger(1)) { - rewriter.replaceOp(op, op.getOperand()); - return success(); - } + auto type = op.getOperand().getType().dyn_cast(); + // If the input is an unranked tensor, cannpt rewrite. + if (!type) return failure(); + + // Expected return type of the ToBool operation. + auto result_type = op.getResult().getType().cast(); + + // If input is already a tensor, it can be folded into an identity. 
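Beyond the identity case, the rewrite handles two more situations just below; a compact sketch of both, using the same local names as the pattern and with the op template arguments written out as assumptions (NotEqualOp for the scalar case, ConstOp for rank one and higher):

if (type.getRank() == 0) {
  // Scalar numeric or string input: ToBool(x) becomes NotEqual(x, zero or "").
  auto zero_const = rewriter.create<TF::ConstOp>(op.getLoc(), zero_attr);
  rewriter.replaceOpWithNewOp<TF::NotEqualOp>(
      op, result_type, op.getOperand(), zero_const,
      /*incompatible_shape_error=*/false);
} else {
  // Rank >= 1 input: the result is a constant that is true iff no dimension
  // is statically zero, i.e. numElements != 0.
  bool any_zero =
      llvm::any_of(type.getShape(), [](int64_t dim) { return dim == 0; });
  rewriter.replaceOpWithNewOp<TF::ConstOp>(
      op, result_type, DenseElementsAttr::get(result_type, {!any_zero}));
}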
+ if (type == result_type) { + rewriter.replaceOp(op, op.getOperand()); + return success(); } - return failure(); + + if (type.getRank() == 0) { + // If the input is a scalar tensor, the ToBool can be expanded to + // element != 0 (for numerical values) or element == empty (for string). + Type element_type = type.getElementType(); + Attribute zero_attr; + if (element_type.isIntOrFloat()) + zero_attr = rewriter.getZeroAttr(type); + else if (element_type.isa()) + zero_attr = DenseStringElementsAttr::get(type, {""}); + + if (!zero_attr) return failure(); + + auto zero_const = rewriter.create(op.getLoc(), zero_attr); + rewriter.replaceOpWithNewOp( + op, result_type, op.getOperand(), zero_const, false); + } else { + // If the input is a non-scalar ranked tensor, ToBool can be expanded + // to numElements != 0. numElements will be 0 iff one of the dimensions is + // zero. + bool any_zero = + llvm::any_of(type.getShape(), [](int64_t dim) { return dim == 0; }); + rewriter.replaceOpWithNewOp( + op, result_type, DenseElementsAttr::get(result_type, {!any_zero})); + } + return success(); } }; } // namespace void ToBoolOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -1844,11 +2056,9 @@ void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, namespace { OpFoldResult FoldIdentityTranspose(TransposeOp op) { - auto const_perm = dyn_cast_or_null(op.perm().getDefiningOp()); - if (!const_perm) return {}; - - auto const_value = const_perm.value(); - const auto elements = const_value.getValues(); + DenseIntElementsAttr perm; + if (!matchPattern(op.perm(), m_Constant(&perm))) return {}; + const auto elements = perm.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; @@ -1871,14 +2081,14 @@ OpFoldResult FoldCancellableTranspose(TransposeOp op) { if (!transpose) return {}; // Permutations defined by constant operations. 
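For the cancellation fold below, two constant permutations cancel when composing them yields the identity permutation. AreCancellablePermutations is assumed to implement that check; a standalone sketch of the condition it encodes:

// Composing the two permutations must give the identity permutation.
static bool ComposesToIdentity(ArrayRef<int64_t> perm0,
                               ArrayRef<int64_t> perm1) {
  if (perm0.size() != perm1.size()) return false;
  for (int64_t i = 0, e = perm0.size(); i < e; ++i)
    if (perm0[perm1[i]] != i) return false;
  return true;
}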
- auto perm0 = dyn_cast_or_null(op.perm().getDefiningOp()); - auto perm1 = dyn_cast_or_null(transpose.perm().getDefiningOp()); - if (!perm0 || !perm1) return {}; + DenseIntElementsAttr perm0; + DenseIntElementsAttr perm1; + if (!matchPattern(op.perm(), m_Constant(&perm0)) || + !matchPattern(transpose.perm(), m_Constant(&perm1))) + return {}; // With permutation indices that cancel each other - auto perm0_value = perm0.value().cast(); - auto perm1_value = perm1.value().cast(); - if (!AreCancellablePermutations(perm0_value, perm1_value)) return {}; + if (!AreCancellablePermutations(perm0, perm1)) return {}; return transpose.x(); } @@ -1909,7 +2119,7 @@ static LogicalResult Verify(UnpackOp op) { if (!value_type) return success(); int64_t value_rank = value_type.getRank(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < -value_rank || axis >= value_rank) return op.emitOpError("axis attribute must be in the range of [-") << value_rank << ", " << value_rank << ')'; @@ -2029,38 +2239,19 @@ OpFoldResult VariableShapeOp::fold(ArrayRef operands) { // WhileOp //===----------------------------------------------------------------------===// -static LogicalResult Verify(WhileOp op) { - auto cond_fn = op.cond_func(); - auto body_fn = op.body_func(); - if (!cond_fn) { - return op.emitOpError("cond refers to an undefined function : ") - << op.cond(); - } - if (!body_fn) { - return op.emitOpError("body refers to an undefined function : ") - << op.body(); - } - - auto cond_fn_type = cond_fn.getType(); - auto body_fn_type = body_fn.getType(); - - // Verify that the cond function has exactly one result. - if (cond_fn_type.getNumResults() != 1) - return op.emitOpError("requires cond function to have exactly one result"); - - SmallVector operands(op.getOperandTypes()); - +static LogicalResult VerifyWhileTypes(Operation *op, TypeRange cond_input, + TypeRange body_input, + TypeRange body_result) { // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. constexpr int kNumTypeLists = 5; - const std::array>, kNumTypeLists> - type_lists = {{ - {"operand", operands}, - {"body function result", body_fn_type.getResults()}, - {"result", op.getResultTypes()}, - {"cond function input", cond_fn_type.getInputs()}, - {"body function input", body_fn_type.getInputs()}, - }}; + const std::array type_lists = {{ + {op->getOperandTypes(), "input"}, + {body_result, "body result"}, + {op->getResultTypes(), "result"}, + {cond_input, "condition input"}, + {body_input, "body input"}, + }}; // A pair of type lists should be cast compatible with each other if one is // converted to the another for a function call or assignment or there is a @@ -2090,28 +2281,38 @@ static LogicalResult Verify(WhileOp op) { for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { auto &a = type_lists[i]; auto &b = type_lists[j]; - - int a_size = a.second.size(); - if (a_size != b.second.size()) - return op.emitOpError( - llvm::formatv("requires the number of {0}s to be equal to the " - "number of {1}s. 
Found {2} and {3}, respectively", - a.first, b.first, a_size, b.second.size())); - - for (int idx = 0; idx < a_size; ++idx) { - auto a_type = a.second[idx]; - auto b_type = b.second[idx]; - - if (!AreCastCompatible({a_type, b_type})) - return op.emitError(llvm::formatv( - "{0} type {1} is incompatible with {2} type {3} at index {4}", - a.first, a_type, b.first, b_type, idx)); - } + if (failed(VerifyTypeRangesAreCompatible(op, a, b))) return failure(); } } return success(); } +static LogicalResult Verify(WhileOp op) { + auto cond_fn = op.cond_function(); + auto body_fn = op.body_function(); + if (!cond_fn) { + return op.emitOpError("cond refers to an undefined function : ") + << op.cond(); + } + if (!body_fn) { + return op.emitOpError("body refers to an undefined function : ") + << op.body(); + } + + auto cond_fn_type = cond_fn.getType(); + auto body_fn_type = body_fn.getType(); + + // Verify that the cond function has exactly one result. + if (cond_fn_type.getNumResults() != 1) + return op.emitOpError("requires cond function to have exactly one result"); + + if (failed(VerifyWhileTypes(op, /*cond_input=*/cond_fn_type.getInputs(), + /*body_input=*/body_fn_type.getInputs(), + /*body_result=*/body_fn_type.getResults()))) + return failure(); + return success(); +} + //===----------------------------------------------------------------------===// // WhileOp canonicalization. //===----------------------------------------------------------------------===// @@ -2125,50 +2326,23 @@ void WhileOp::getCanonicalizationPatterns(OwningRewritePatternList &results, //===----------------------------------------------------------------------===// static LogicalResult Verify(WhileRegionOp op) { // Verify that the condition generates a single tensor result. - YieldOp yield = cast(op.cond().front().getTerminator()); - if (yield.getNumOperands() != 1) + Operation *cond_yield = op.cond().front().getTerminator(); + if (cond_yield->getNumOperands() != 1) return op.emitOpError() << "condition should have a single tensor result"; - auto cond_type = yield.getOperand(0).getType().dyn_cast(); + auto cond_type = + cond_yield->getOperand(0).getType().dyn_cast(); if (!cond_type || !cond_type.getShape().equals({}) || !cond_type.getElementType().isInteger(/*width=*/1)) return op.emitOpError() << "condition should have a single tensor result"; - // The body result types should match while op result types. - if (failed(VerifyRegionResults(op, op.body(), "body"))) return failure(); - - // Both condition and body should have same number and type of operands as - // the WhileRegion inputs. 
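VerifyWhileTypes above compares every pair of type lists except (input, body result); a short sketch of the pair enumeration the nested loop performs, with indices following the declaration order of type_lists:

// 0: op input, 1: body result, 2: op result, 3: condition input, 4: body input
for (int i = 0; i < kNumTypeLists; ++i)
  for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) {
    // Visits (0,2) (0,3) (0,4) (1,2) (1,3) (1,4) (2,3) (2,4) (3,4).
    if (failed(VerifyTypeRangesAreCompatible(op, type_lists[i], type_lists[j])))
      return failure();
  }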
- const int num_inputs = op.getNumOperands(); - auto block_inputs_match_op_inputs = [&](Region ®ion, - StringRef name) -> LogicalResult { - Block &block = region.front(); - if (block.getNumArguments() != num_inputs) - return op.emitOpError() - << name << " should have same number of inputs (" << num_inputs - << ") as " << WhileRegionOp::getOperationName() << " but has " - << block.getNumArguments() << " inputs"; - - for (auto types_idx : llvm::enumerate( - llvm::zip(op.getOperandTypes(), block.getArgumentTypes()))) { - auto op_input_type = std::get<0>(types_idx.value()); - auto block_input_type = std::get<1>(types_idx.value()); - if (!AreCastCompatible({block_input_type, op_input_type})) - return op.emitOpError(llvm::formatv( - "{0} input type {1} is incompatible with {2} " - "input type {3} at index {4}", - name, block_input_type, WhileRegionOp::getOperationName(), - op_input_type, types_idx.index())); - } - return success(); - }; - - if (failed(block_inputs_match_op_inputs(op.cond(), "condition")) || - failed(block_inputs_match_op_inputs(op.body(), "body"))) + Operation *body_yield = op.body().front().getTerminator(); + if (failed(VerifyWhileTypes(op, /*cond_input=*/op.cond().getArgumentTypes(), + /*body_input=*/op.body().getArgumentTypes(), + /*body_result=*/body_yield->getOperandTypes()))) return failure(); - return success(); } @@ -2280,7 +2454,8 @@ struct WhileRegionEliminatePassThrough auto &new_body_block = new_while_op.body().front(); auto &new_yield = *new_body_block.getTerminator(); - // Build a vector of new results. Also patch up the region bodies and yield. + // Build a vector of new results. Also patch up the region bodies and + // yield. SmallVector new_results; next_idx = 0; for (int op_idx : llvm::seq(0, old_num_operands)) { @@ -2315,12 +2490,12 @@ void XdivyOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +} // namespace TF +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h index 761c06a475c..9b06d855b01 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h @@ -38,15 +38,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" -namespace mlir { -namespace TF { - #define GET_OP_FWD_DEFINES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc index e87cc494a4a..38f9175a500 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -70,11 +70,12 @@ limitations under the License. 
namespace mlir { namespace TF { - namespace { #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace +} // namespace TF +} // namespace mlir //===----------------------------------------------------------------------===// // TableGen'd op method definitions @@ -82,6 +83,3 @@ namespace { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h index 8586515edee..589e0e91615 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h @@ -36,15 +36,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" -namespace mlir { -namespace TF { - #define GET_OP_FWD_DEFINES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 94a792ec3db..1eaf997ab69 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -105,15 +105,27 @@ static LogicalResult Verify(SessionInitializerOp session_initializer) { return success(); } +} // namespace tf_saved_model +} // namespace mlir + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" +namespace mlir { +namespace tf_saved_model { + //===----------------------------------------------------------------------===// // TensorFlowSavedModelDialect Dialect //===----------------------------------------------------------------------===// TensorFlowSavedModelDialect::TensorFlowSavedModelDialect(MLIRContext *context) - : Dialect(/*name=*/"tf_saved_model", context) { + : Dialect(/*name=*/"tf_saved_model", context, + TypeID::get()) { + // The TensorFlow Dialect is needed in the verifier and other routines + // associated to this dialect. It makes little sense anyway to use the + // SavedModel dialect without the TensorFlow Dialect. + context->loadDialect(); + addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h index 02b7f0b75f4..c8518a9ca02 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h @@ -40,10 +40,16 @@ class TensorFlowSavedModelDialect : public Dialect { static StringRef getDialectNamespace() { return "tf_saved_model"; } }; +} // namespace tf_saved_model +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h.inc" +namespace mlir { +namespace tf_saved_model { + // Returns the list of exported names for `op`. // An empty list means `op` is not exported. 
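The tf_saved_model dialect constructor above registers its TypeID and loads the TensorFlow dialect it depends on before adding its operations. A sketch of that constructor with the template arguments written out as assumptions:

TensorFlowSavedModelDialect::TensorFlowSavedModelDialect(MLIRContext *context)
    : Dialect(/*name=*/"tf_saved_model", context,
              TypeID::get<TensorFlowSavedModelDialect>()) {
  // tf_saved_model verifiers inspect tf.* ops, so the TF dialect must be
  // loaded before this dialect is used.
  context->loadDialect<TF::TensorFlowDialect>();

  addOperations<
#define GET_OP_LIST
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc"
      >();
}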
SmallVector GetExportedNames(Operation *op); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index a22a684953b..753e2368d6e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -82,7 +82,7 @@ def TfSavedModel_Dialect : Dialect { with "get_global @some_global_tensor" in the function body. }]; - let cppNamespace = "tf_saved_model"; + let cppNamespace = "::mlir::tf_saved_model"; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h index 9be61b1db39..8dc5ffb5d09 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -35,6 +35,14 @@ struct TensorArray : ::mlir::SideEffects::Resource::Base { StringRef getName() final { return "TensorArray"; } }; +struct Summary : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Summary"; } +}; + +struct LookupTable : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "LookupTable"; } +}; + } // namespace ResourceEffects } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc index 6c5485c16dd..9d8f25c6633 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc @@ -15,11 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" -namespace mlir { - -// NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc.inc" +namespace mlir { namespace TF { void RuntimeDevices::AddDevice(const ParsedName& device) { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h index b1f39ad1d28..b90bf2d47a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h @@ -26,10 +26,9 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "tensorflow/core/util/device_name_utils.h" -namespace mlir { - #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h.inc" +namespace mlir { namespace TF { // Tensorflow devices available at runtime with corresponding metadata if it is diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index fc8e6f40f65..412bf113a0f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -33,7 +33,7 @@ namespace TF { static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, mlir::Type maybe_ref_type) { if (auto ref_type = maybe_ref_type.dyn_cast()) - return success(ref_type.RemoveRef().getKind() == type.getKind()); + return success(ref_type.RemoveRef().getTypeID() == type.getTypeID()); return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 994378ea1cf..50f034e8ba1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -17,6 +17,7 @@ limitations under the License. 
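The Summary and LookupTable side-effect resources added above follow the same CRTP shape as the existing TensorArray resource; the base template argument is assumed to be the resource struct itself, for example:

struct Summary : ::mlir::SideEffects::Resource::Base<Summary> {
  StringRef getName() final { return "Summary"; }
};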
#include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -61,6 +62,192 @@ bool GetCastCompatibleShape(llvm::ArrayRef a_shape, return true; } +} // namespace + +namespace mlir { +namespace TF { +//===----------------------------------------------------------------------===// +// Utility iterators +//===----------------------------------------------------------------------===// + +OperandShapeIterator::OperandShapeIterator(Operation::operand_iterator it) + : llvm::mapped_iterator> (*)(Value)>( + it, &GetShape) {} + +ResultShapeIterator::ResultShapeIterator(Operation::result_iterator it) + : llvm::mapped_iterator> (*)(Value)>( + it, &GetShape) {} + +//===----------------------------------------------------------------------===// +// TF types helper functions +//===----------------------------------------------------------------------===// + +bool TensorFlowType::classof(Type type) { + return type.getDialect().getNamespace() == "tf"; +} +bool TensorFlowRefType::classof(Type type) { + return type.isa< +#define HANDLE_TF_TYPE(tftype, enumerant, name) +#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) tftype##Type, +#define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + >(); +} +bool TensorFlowTypeWithSubtype::classof(Type type) { + return type.isa(); +} + +TensorFlowType TensorFlowRefType::get(Type type) { + MLIRContext* ctx = type.getContext(); + type = getElementTypeOrSelf(type); + if (type.isF16()) { + return HalfRefType::get(ctx); + } else if (type.isF32()) { + return FloatRefType::get(ctx); + } else if (type.isF64()) { + return DoubleRefType::get(ctx); + } else if (type.isBF16()) { + return Bfloat16RefType::get(ctx); + } else if (auto complex_type = type.dyn_cast()) { + Type etype = complex_type.getElementType(); + if (etype.isF32()) { + return Complex64RefType::get(ctx); + } else if (etype.isF64()) { + return Complex128RefType::get(ctx); + } + llvm_unreachable("unexpected complex type"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return BoolRefType::get(ctx); + case 8: + return itype.isUnsigned() ? TensorFlowType(Uint8RefType::get(ctx)) + : Int8RefType::get(ctx); + case 16: + return itype.isUnsigned() ? TensorFlowType(Uint16RefType::get(ctx)) + : Int16RefType::get(ctx); + case 32: + return itype.isUnsigned() ? TensorFlowType(Uint32RefType::get(ctx)) + : Int32RefType::get(ctx); + case 64: + return itype.isUnsigned() ? 
TensorFlowType(Uint64RefType::get(ctx)) + : Int64RefType::get(ctx); + default: + llvm_unreachable("unexpected integer type"); + } + } +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (auto derived_ty = type.dyn_cast()) \ + return tftype##RefType::get(ctx); + +#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + llvm_unreachable("unexpected type kind"); +} + +Type TensorFlowRefType::RemoveRef() { + MLIRContext* ctx = getContext(); + if (isa()) return mlir::FloatType::getF16(ctx); + if (isa()) return mlir::FloatType::getF32(ctx); + if (isa()) return mlir::FloatType::getF64(ctx); + if (isa()) return mlir::FloatType::getBF16(ctx); + if (isa()) return mlir::IntegerType::get(1, ctx); + if (isa()) return mlir::IntegerType::get(8, ctx); + if (isa()) return mlir::IntegerType::get(16, ctx); + if (isa()) return mlir::IntegerType::get(32, ctx); + if (isa()) return mlir::IntegerType::get(64, ctx); + if (isa()) + return mlir::IntegerType::get(8, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(16, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(32, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(64, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::ComplexType::get(mlir::FloatType::getF32(ctx)); + if (isa()) + return mlir::ComplexType::get(mlir::FloatType::getF64(ctx)); +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (isa()) return tftype##Type::get(ctx); + +#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + llvm_unreachable("unexpected tensorflow ref type kind"); +} + +Type TensorFlowTypeWithSubtype::RemoveSubtypes() { + MLIRContext* ctx = getContext(); + if (isa()) return VariantType::get(ctx); + if (isa()) return ResourceType::get(ctx); + llvm_unreachable("unexpected tensorflow type with subtypes kind"); +} + +ArrayRef TensorFlowTypeWithSubtype::GetSubtypes() { + if (auto variant_type = dyn_cast()) + return variant_type.getSubtypes(); + if (auto resource_type = dyn_cast()) + return resource_type.getSubtypes(); + llvm_unreachable("unexpected tensorflow type with subtypes kind"); +} + +// TODO(jpienaar): BroadcastCompatible and HasCompatibleElementTypes have +// similar structure that could be extracted into helper method. +bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { + if (lhs.size() != rhs.size()) return false; + for (auto types : llvm::zip(lhs, rhs)) { + // Drop ref types because they don't affect broadcast compatibility. E.g., + // `tensor` and `tensor` should be considered broadcast + // compatible. + auto lhs_type = DropRefType(std::get<0>(types)); + auto rhs_type = DropRefType(std::get<1>(types)); + + // This should be true for all TF ops: + auto lhs_tt = lhs_type.dyn_cast(); + auto rhs_tt = rhs_type.dyn_cast(); + if (!lhs_tt || !rhs_tt) { + if (lhs_type != rhs_type) return false; + continue; + } + + // Verify matching element types. These should be identical, except for + // variant type where unknown subtype is considered compatible with all + // subtypes. + auto lhs_et = lhs_tt.getElementType(); + auto rhs_et = rhs_tt.getElementType(); + if (lhs_et != rhs_et) { + // If either does not have subtypes, then the element types don't match. + auto lhs_wst = lhs_et.dyn_cast(); + auto rhs_wst = rhs_et.dyn_cast(); + if (!lhs_wst || !rhs_wst) return false; + + // Consider the subtype of variant types. 
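With the kind-based switches above replaced by classof/isa dispatch, call sites keep using ordinary casts. A minimal helper sketch (hypothetical name, not part of this change) that strips a ref element type when one is present:

static Type StripRefIfPresent(Type type) {
  // TensorFlowRefType::classof is now driven by the generated type list, so
  // dyn_cast works without a type kind.
  if (auto ref = type.dyn_cast<TF::TensorFlowRefType>()) return ref.RemoveRef();
  return type;
}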
+ auto lhs_wst_st = lhs_wst.GetSubtypes(); + auto rhs_wst_st = rhs_wst.GetSubtypes(); + if (!lhs_wst_st.empty() && !rhs_wst_st.empty()) { + for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) { + if (!BroadcastCompatible(std::get<0>(subtypes), + std::get<1>(subtypes))) + return false; + } + } + } + + auto lhs_rt = lhs_type.dyn_cast(); + auto rhs_rt = rhs_type.dyn_cast(); + if (!lhs_rt || !rhs_rt) return true; + SmallVector shape; + return OpTrait::util::getBroadcastedShape(lhs_rt.getShape(), + rhs_rt.getShape(), shape); + } + return true; +} + // Given two types `a` and `b`, returns a refined type which is cast compatible // with both `a` and `b` and is equal to or more precise than both of them. It // returns empty Type if the input types are not cast compatible. @@ -100,7 +287,7 @@ mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, if (a == b) return a; } } - if (a.getKind() != b.getKind()) return nullptr; + if (a.getTypeID() != b.getTypeID()) return nullptr; // If either is not a type that contain subtypes then the types are not cast // compatible. @@ -156,199 +343,6 @@ mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, return mlir::RankedTensorType::get(refined_shape, refined_element_ty); } -} // namespace - -namespace mlir { -namespace TF { -//===----------------------------------------------------------------------===// -// Utility iterators -//===----------------------------------------------------------------------===// - -OperandShapeIterator::OperandShapeIterator(Operation::operand_iterator it) - : llvm::mapped_iterator> (*)(Value)>( - it, &GetShape) {} - -ResultShapeIterator::ResultShapeIterator(Operation::result_iterator it) - : llvm::mapped_iterator> (*)(Value)>( - it, &GetShape) {} - -//===----------------------------------------------------------------------===// -// TF types helper functions -//===----------------------------------------------------------------------===// - -TensorFlowType TensorFlowRefType::get(Type type) { - MLIRContext* ctx = type.getContext(); - switch (getElementTypeOrSelf(type).getKind()) { - case StandardTypes::F16: - return HalfRefType::get(ctx); - case StandardTypes::F32: - return FloatRefType::get(ctx); - case StandardTypes::F64: - return DoubleRefType::get(ctx); - case StandardTypes::BF16: - return Bfloat16RefType::get(ctx); - case StandardTypes::Complex: { - const auto& etype = type.cast().getElementType(); - switch (getElementTypeOrSelf(etype).getKind()) { - case StandardTypes::F32: - return Complex64RefType::get(ctx); - case StandardTypes::F64: - return Complex128RefType::get(ctx); - default: - llvm_unreachable("unexpected complex type"); - } - } - case StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return BoolRefType::get(ctx); - case 8: - return itype.isUnsigned() ? TensorFlowType(Uint8RefType::get(ctx)) - : Int8RefType::get(ctx); - case 16: - return itype.isUnsigned() ? TensorFlowType(Uint16RefType::get(ctx)) - : Int16RefType::get(ctx); - case 32: - return itype.isUnsigned() ? TensorFlowType(Uint32RefType::get(ctx)) - : Int32RefType::get(ctx); - case 64: - return itype.isUnsigned() ? 
TensorFlowType(Uint64RefType::get(ctx)) - : Int64RefType::get(ctx); - default: - llvm_unreachable("unexpected integer type"); - } - } -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - return tftype##RefType::get(ctx); - -#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) -// NOLINTNEXTLINE -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - llvm_unreachable("unexpected type kind"); - } -} - -Type TensorFlowRefType::RemoveRef() { - MLIRContext* ctx = getContext(); - switch (getKind()) { - case TensorFlowTypes::HALF_REF: - return mlir::FloatType::getF16(ctx); - case TensorFlowTypes::FLOAT_REF: - return mlir::FloatType::getF32(ctx); - case TensorFlowTypes::DOUBLE_REF: - return mlir::FloatType::getF64(ctx); - case TensorFlowTypes::BFLOAT16_REF: - return mlir::FloatType::getBF16(ctx); - case TensorFlowTypes::BOOL_REF: - return mlir::IntegerType::get(1, ctx); - case TensorFlowTypes::INT8_REF: - return mlir::IntegerType::get(8, ctx); - case TensorFlowTypes::INT16_REF: - return mlir::IntegerType::get(16, ctx); - case TensorFlowTypes::INT32_REF: - return mlir::IntegerType::get(32, ctx); - case TensorFlowTypes::INT64_REF: - return mlir::IntegerType::get(64, ctx); - case TensorFlowTypes::UINT8_REF: - return mlir::IntegerType::get(8, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT16_REF: - return mlir::IntegerType::get(16, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT32_REF: - return mlir::IntegerType::get(32, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT64_REF: - return mlir::IntegerType::get(64, IntegerType::Unsigned, ctx); - case TensorFlowTypes::COMPLEX64_REF: - return mlir::ComplexType::get(mlir::FloatType::getF32(ctx)); - case TensorFlowTypes::COMPLEX128_REF: - return mlir::ComplexType::get(mlir::FloatType::getF64(ctx)); -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant##_REF: \ - return tftype##Type::get(ctx); - -#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) -// NOLINTNEXTLINE -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - llvm_unreachable("unexpected tensorflow ref type kind"); - } -} - -Type TensorFlowTypeWithSubtype::RemoveSubtypes() { - MLIRContext* ctx = getContext(); - switch (getKind()) { - case TensorFlowTypes::VARIANT: - return VariantType::get(ctx); - case TensorFlowTypes::RESOURCE: - return ResourceType::get(ctx); - default: - llvm_unreachable("unexpected tensorflow type with subtypes kind"); - } -} - -ArrayRef TensorFlowTypeWithSubtype::GetSubtypes() { - switch (getKind()) { - case TensorFlowTypes::VARIANT: - return this->cast().getSubtypes(); - case TensorFlowTypes::RESOURCE: - return this->cast().getSubtypes(); - default: - llvm_unreachable("unexpected tensorflow type with subtypes kind"); - } -} - -// TODO(jpienaar): BroadcastCompatible and HasCompatibleElementTypes have -// similar structure that could be extracted into helper method. -bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { - if (lhs.size() != rhs.size()) return false; - for (auto types : llvm::zip(lhs, rhs)) { - auto lhs_type = std::get<0>(types); - auto rhs_type = std::get<1>(types); - - // This should be true for all TF ops: - auto lhs_tt = lhs_type.dyn_cast(); - auto rhs_tt = rhs_type.dyn_cast(); - if (!lhs_tt || !rhs_tt) { - if (lhs_type != rhs_type) return false; - continue; - } - - // Verify matching element types. 
These should be identical, except for - // variant type where unknown subtype is considered compatible with all - // subtypes. - auto lhs_et = lhs_tt.getElementType(); - auto rhs_et = rhs_tt.getElementType(); - if (lhs_et != rhs_et) { - // If either does not have subtypes, then the element types don't match. - auto lhs_wst = lhs_et.dyn_cast(); - auto rhs_wst = rhs_et.dyn_cast(); - if (!lhs_wst || !rhs_wst) return false; - - // Consider the subtype of variant types. - auto lhs_wst_st = lhs_wst.GetSubtypes(); - auto rhs_wst_st = rhs_wst.GetSubtypes(); - if (!lhs_wst_st.empty() && !rhs_wst_st.empty()) { - for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) { - if (!BroadcastCompatible(std::get<0>(subtypes), - std::get<1>(subtypes))) - return false; - } - } - } - - auto lhs_rt = lhs_type.dyn_cast(); - auto rhs_rt = rhs_type.dyn_cast(); - if (!lhs_rt || !rhs_rt) return true; - SmallVector shape; - return OpTrait::util::getBroadcastedShape(lhs_rt.getShape(), - rhs_rt.getShape(), shape); - } - return true; -} bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs) { @@ -366,27 +360,31 @@ bool AreCastCompatible(ArrayRef types) { return true; } -ShapedType DropTypeSubTypes(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto subtype_ty = element_ty.dyn_cast(); - if (!subtype_ty) return ty; +// Assumes a function `GetDefaultTypeOf(ComposedType)` that returns the default +// type for a composed type (such as a ref type or a type with subtypes). +template +Type DropTypeHelper(Type ty) { + Type element_ty = getElementTypeOrSelf(ty); + auto composed_type = element_ty.dyn_cast(); + if (!composed_type) return ty; - Type default_ty = GetDefaultTypeOf(subtype_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); + Type default_ty = GetDefaultTypeOf(composed_type); + if (auto ranked_ty = ty.dyn_cast()) { + return RankedTensorType::get(ranked_ty.getShape(), default_ty); + } else if (ty.dyn_cast()) { + return UnrankedTensorType::get(default_ty); + } else { + return default_ty; + } } -ShapedType DropRefType(ShapedType ty) { - Type element_ty = ty.getElementType(); - TF::TensorFlowRefType ref_ty = element_ty.dyn_cast(); - if (!ref_ty) return ty; - - Type default_ty = TF::GetDefaultTypeOf(ref_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); +Type DropSubTypes(Type ty) { + return DropTypeHelper(ty); } +Type DropRefType(Type ty) { return DropTypeHelper(ty); } + +Type DropRefAndSubTypes(Type ty) { return DropRefType(DropSubTypes(ty)); } + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index 125f6bb31df..60a86f32920 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -67,26 +67,13 @@ using ResultShapeRange = iterator_range; // TensorFlow types //===----------------------------------------------------------------------===// -namespace TensorFlowTypes { -// List of supported TensorFlowType kinds, necessary for isa/dyn_cast. -enum Kind { - FIRST_USED_TENSORFLOW_TYPE = Type::FIRST_TENSORFLOW_TYPE, -#define HANDLE_TF_TYPE(tftype, enumerant, name) enumerant, -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - LAST_USED_TENSORFLOW_TYPE, -}; -} // namespace TensorFlowTypes - // The base class in the TensorFlow type hierarchy. 
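The DropTypeHelper-based functions above now accept bare element types as well as tensor types. A small usage sketch, with illustrative types, of what DropRefAndSubTypes is expected to produce under those definitions:

// Builds tensor<4x!tf.variant<tensor<2xf32>>>; dropping subtypes keeps the
// outer shape and yields tensor<4x!tf.variant>; no ref type is involved.
static Type ExampleDrop(MLIRContext *ctx) {
  Type f32 = FloatType::getF32(ctx);
  TensorType subtype = RankedTensorType::get({2}, f32);
  Type variant = TF::VariantType::get({subtype}, ctx);
  Type tensor_of_variant = RankedTensorType::get({4}, variant);
  return TF::DropRefAndSubTypes(tensor_of_variant);
}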
class TensorFlowType : public Type { public: using Type::Type; // Support method to enable LLVM-style type casting. - static bool classof(Type type) { - return type.getKind() >= Type::FIRST_TENSORFLOW_TYPE && - type.getKind() <= TensorFlowTypes::LAST_USED_TENSORFLOW_TYPE; - } + static bool classof(Type type); }; // Returns true if the specified type is a valid TensorFlow element type. @@ -105,10 +92,7 @@ static inline bool IsValidTFTensorType(Type type) { namespace detail { // Common implementation of TensorFlow types. The template argument indicates -// the concrete derived class per CRTP. Concrete classes must implement the -// following: -// - `static unsigned getTypeKind()` that returns the (fixed) kind of the -// type. +// the concrete derived class per CRTP. template class TensorFlowTypeImpl : public Type::TypeBase { @@ -116,14 +100,6 @@ class TensorFlowTypeImpl using Base = typename Type::TypeBase; using TFBase = TensorFlowTypeImpl; using Base::Base; - - // Get the unique'ed type in the given context. - static Derived get(MLIRContext* context) { - return Base::get(context, Derived::getTypeKind()); - } - - // Support method to enable LLVM-style type casting. - static bool kindof(unsigned kind) { return kind == Derived::getTypeKind(); } }; } // namespace detail @@ -133,10 +109,7 @@ class TensorFlowRefType : public TensorFlowType { using TensorFlowType::TensorFlowType; // Checks if a type is TensorFlow Ref type. - static bool classof(Type type) { - return type.getKind() >= TensorFlowTypes::FLOAT_REF && - type.getKind() <= TensorFlowTypes::LAST_USED_TENSORFLOW_TYPE; - } + static bool classof(Type type); // Converts a type to the corresponding TensorFlowRef type. static TensorFlowType get(Type type); @@ -182,7 +155,6 @@ static inline Type GetElementTypeOrSelfResolveRef(Type type) { class tftype##Type : public detail::TensorFlowTypeImpl { \ public: \ using TFBase::TFBase; \ - static unsigned getTypeKind() { return TensorFlowTypes::enumerant; } \ }; // Custom TensorFlow types are defined separately. @@ -220,8 +192,6 @@ class TypeWithSubtypeStorage : public TypeStorage { // opaque and their interpretation depends on the actual underlying type. // The template argument indicates the concrete derived class per CRTP. Concrete // classes must implement the following: -// - `static unsigned getTypeKind()` that returns the (fixed) kind of the -// type. // - `static std::string getTypeName()` that returns the name of the type for // verification logging. template @@ -233,19 +203,16 @@ class TypeWithSubtypeImpl using Base::Base; static Derived get(ArrayRef subtypes, MLIRContext* context) { - return Base::get(context, Derived::getTypeKind(), subtypes); + return Base::get(context, subtypes); } static Derived getChecked(ArrayRef subtypes, MLIRContext* context, Location loc) { - return Base::getChecked(loc, Derived::getTypeKind(), subtypes); + return Base::getChecked(loc, subtypes); } static Derived get(MLIRContext* context) { return get({}, context); } - // Support method to enable LLVM-style type casting. - static bool kindof(unsigned kind) { return kind == Derived::getTypeKind(); } - static LogicalResult verifyConstructionInvariants( Location loc, ArrayRef subtypes) { // Each of the subtypes should be a valid TensorFlow type. @@ -269,10 +236,7 @@ class TensorFlowTypeWithSubtype : public TensorFlowType { using TensorFlowType::TensorFlowType; // Checks if a type is TensorFlow type with subtypes. 
- static bool classof(Type type) { - return type.getKind() == TensorFlowTypes::VARIANT || - type.getKind() == TensorFlowTypes::RESOURCE; - } + static bool classof(Type type); // Converts a TypeWithSubtype type to the same type but without its subtypes. Type RemoveSubtypes(); @@ -294,7 +258,6 @@ static inline Type GetDefaultTypeOf(TensorFlowTypeWithSubtype type) { class ResourceType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; - static unsigned getTypeKind() { return TensorFlowTypes::RESOURCE; } static std::string getTypeName() { return "ResourceType"; } }; @@ -306,10 +269,18 @@ class ResourceType : public detail::TypeWithSubtypeImpl { class VariantType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; - static unsigned getTypeKind() { return TensorFlowTypes::VARIANT; } static std::string getTypeName() { return "VariantType"; } }; +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a); + // Returns whether two arrays of Type are broadcast compatible. bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs); @@ -331,15 +302,21 @@ bool HasCompatibleElementTypes(Type lhs, Type rhs, // compatible. bool AreCastCompatible(ArrayRef types); -// If the given tensor has elements of type with subtypes, then returns a new -// type after dropping subtypes info. Otherwise, returns the original type as -// is. -ShapedType DropTypeSubTypes(ShapedType ty); +// If `ty` is a tensor type and its element type has subtypes, then returns a +// new type of same shape but dropped subtypes for the element type. +// Otherwise, if `ty` has subtypes, then returns corresponding type with dropped +// subtypes. +// Otherwise, returns the original type `ty`. +Type DropSubTypes(Type ty); -// If the given tensor has elements of type ref, then returns a new type -// of the shape, but corresponding non-ref type as element type. Otherwise, -// returns the original type as is. -ShapedType DropRefType(ShapedType ty); +// If `ty` is a tensor type and has elements of a ref type, then returns a new +// type of same shape but corresponding non-ref type as element type. +// Otherwise, if `ty` is a ref type, then returns corresponding non-ref type. +// Otherwise, returns the original type `ty`. +Type DropRefType(Type ty); + +// Convenience call for executing both `DropRefType` and `DropSubTypes`. +Type DropRefAndSubTypes(Type ty); } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc new file mode 100644 index 00000000000..6a6a7574f29 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" + +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h similarity index 57% rename from tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc rename to tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h index 9d1c354690a..039f211533c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ -// Static initialization for *HLO dialects registration. -static mlir::DialectRegistration mhlo_ops; -static mlir::DialectRegistration chlo_ops; -static mlir::DialectRegistration lmhlo_ops; +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td new file mode 100644 index 00000000000..fea9500b638 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This is the operation definition file for TensorFlow operations with +// implementation available only in TFRT. + +#ifndef TFRT_OPS +#define TFRT_OPS + +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" +include "mlir/IR/OpBase.td" + +def TF__JitFusedMatMulOp : TF_Op<"_JitFusedMatMul", [NoSideEffect, SameOperandsAndResultElementType]> { + let summary = [{ + MatMul operation with an output fusion compiled at runtime via MLIR codegen. + }]; + + let description = [{ +The inputs to the MatMul are specified by `a` and `b`. The series of operations +that follows is specified by the `fusion` attribute, which is a list of output +kernel names specified as strings (e.g. "BiasAdd"). They are performed in order, +where the (first) input to each op is the output of the preceding op. The first +input and the output of each fused_op must be of type T. + +Supported list of fusions is defined by ContractionOutputKernelBuilder +implementations. + +*WARN*: This is a TFRT only operations, and it does not exist in TF. This +operation is only added by the ContractionFusion pass. + }]; + + let arguments = (ins + TensorOf<[F32]>:$a, + TensorOf<[F32]>:$b, + Variadic>:$additional_args, + + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$fusion + ); + + let results = (outs + TensorOf<[F32]>:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +#endif // TFRT_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc b/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc index 211866900aa..d2c2cecdfdd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc @@ -21,7 +21,7 @@ namespace tensorflow { REGISTER_OP("MlirLocalVarOp") .Output("resource: resource") .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"(Creates a handle to a in-scope variable. + .Doc(R"(Creates a handle to an in-scope variable. Used by internal passes for temporary representation of local state, which will be eventually removed.)"); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 05d34eb0755..6654341ab42 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -285,7 +285,7 @@ func @empty_island_multiple_data_results(%arg0: tensor<*xf32>, %arg1: tensor<*xi // and certain tf_executor ops are added correctly. 
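The _JitFusedMatMul description above is easiest to read next to a concrete rewrite. The sketch below mirrors the contraction_fusion.mlir test added later in this patch (the output of the -tf-contraction-fusion pass); the SSA names and the 8x32/32x64 f32 shapes are illustrative only.

  // Before fusion:
  %0 = "tf.MatMul"(%lhs, %rhs) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32>
  %1 = "tf.BiasAdd"(%0, %bias) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32>
  %2 = "tf.Relu"(%1) : (tensor<8x64xf32>) -> tensor<8x64xf32>

  // After fusion: a single op, with the output kernels recorded in application order.
  %fused = "tf._JitFusedMatMul"(%lhs, %rhs, %bias) {fusion = ["BiasAdd", "Relu"], transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>, tensor<64xf32>) -> tensor<8x64xf32>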
// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" -// CHECK: tf_executor.NextIteration.Sink [{{.*}}] {{.*}}, %[[CONTROL]] +// CHECK: tf_executor.NextIteration.Sink[{{.*}}] {{.*}}, %[[CONTROL]] func @next_iteration_sink_control_input() { tf_executor.graph { %source:3 = tf_executor.NextIteration.Source : tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 595bdce5be4..ff90c6f4c5b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -444,6 +444,14 @@ func @testReshapeNoOp(%arg0: tensor<2x4xf32>, %arg1: tensor<2xi32>) -> tensor<2x return %0 : tensor<2x4xf32> } +// CHECK-LABEL: func @testBroadcastToNoOp +func @testBroadcastToNoOp(%arg0: tensor<2x4xf32>, %arg1: tensor<2xi32>) -> tensor<2x4xf32> { + %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<2x4xf32>, tensor<2xi32>) -> tensor<2x4xf32> + + // CHECK: return %arg0 + return %0 : tensor<2x4xf32> +} + // CHECK-LABEL: func @testPackShapeComputation func @testPackShapeComputation(%arg0: tensor, %arg1: tensor, %arg2: tensor<*xf32>) -> (tensor<2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<*xi32>) { // Test dimensions sizes. @@ -560,6 +568,14 @@ func @testSelectElseUnranked(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: return %0: tensor<*xf16> } +// CHECK-LABEL: testTileMultiplesAllOnes +func @testTileMultiplesAllOnes(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %cst = constant dense <[1, 1]> : tensor<2xi32> + // CHECK: return %arg0 + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + return %0: tensor<2x3xf32> +} + // CHECK-LABEL: testLogicalNotOfEqual func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> { %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> @@ -620,6 +636,15 @@ func @testLogicalNotOfLessEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32> // CHECK: return %0 } +// CHECK-LABEL: testSizeFolding +func @testSizeFolding(%arg0: tensor<3x5x7xf32>) -> tensor { + %0 = "tf.Size"(%arg0) : (tensor<3x5x7xf32>) -> tensor + return %0: tensor + +// CHECK: %0 = "tf.Const"() {value = dense<105> : tensor} : () -> tensor +// CHECK: return %0 : tensor +} + // CHECK-LABEL: testDivWithSqrtDivisor func @testDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32> @@ -685,6 +710,15 @@ func @identityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x5x6xf32> { // CHECK: return %arg0 } +// CHECK-LABEL: @identityTransposeConst +func @identityTransposeConst(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x5x6xf32> { + %0 = constant dense<[0, 1, 2, 3, 4]> : tensor<5xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x5x6xf32> + + return %1 : tensor<2x3x4x5x6xf32> + // CHECK: return %arg0 +} + // CHECK-LABEL: @nonIdentityTranspose func @nonIdentityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x6x5xf32> { %0 = "tf.Const"() {value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>} : () -> tensor<5xi32> @@ -707,6 +741,17 @@ func @cancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { // CHECK: return %arg0 } +// CHECK-LABEL: @cancellableTransposeConst +func @cancellableTransposeConst(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { + %0 = constant 
dense<[0, 3, 1, 2]> : tensor<4xi32> + %1 = constant dense<[0, 2, 3, 1]> : tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + + return %3 : tensor<1x4x4x8xf32> + // CHECK: return %arg0 +} + // CHECK-LABEL: @nonCancellableTranspose func @nonCancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<4x1x4x8xf32> { %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -725,13 +770,72 @@ func @addN(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } -// CHECK-LABEL: func @ToBool_0DScalar -func @ToBool_0DScalar(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @ToBool_0DScalarI1 +func @ToBool_0DScalarI1(%arg0: tensor) -> tensor { // CHECK: return %arg0 %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @ToBool_0DScalarInt +func @ToBool_0DScalarInt(%arg0: tensor) -> tensor { + // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) + // CHECK: return [[NE]] + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_0DScalarFloat +func @ToBool_0DScalarFloat(%arg0: tensor) -> tensor { + // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) + // CHECK: return [[NE]] + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_0DScalarString +func @ToBool_0DScalarString(%arg0: tensor) -> tensor { + // CHECK: [[EmptyStr:%.*]] = "tf.Const"() {value = dense<""> : tensor} : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) {incompatible_shape_error = false} : (tensor, tensor) -> tensor + // CHECK: return [[NE]] : tensor + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_1DTensor +func @ToBool_1DTensor(%arg0: tensor<1xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_1DTensorZeroDim +func @ToBool_1DTensorZeroDim(%arg0: tensor<0xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<0xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_2DTensor +func @ToBool_2DTensor(%arg0: tensor<1x5xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1x5xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_2DTensorZeroDim +func @ToBool_2DTensorZeroDim(%arg0: tensor<1x0xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1x0xf32>) -> tensor + return %0 : tensor +} + // CHECK-LABEL: testReadVariableOpOfCast func @testReadVariableOpOfCast(%arg0: tensor>>) -> tensor<8x40xf32> { %0 = "tf.Cast"(%arg0) : (tensor>>) -> tensor<*x!tf.resource> @@ -826,6 +930,51 @@ func @foldIf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tens return %4 : tensor } +// CHECK-LABEL: foldIfRegion +func @foldIfRegion(%arg0: tensor, %arg1: tensor, 
%arg2: tensor) -> (tensor, tensor) { + %false = "tf.Const"() {value = dense : tensor} : () -> tensor + %true = "tf.Const"() {value = dense : tensor} : () -> tensor + + // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1) + %0 = "tf.IfRegion"(%true) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: (tensor) -> tensor + + // CHECK: [[Val1:%.*]] = "tf.Sub"(%arg0, %arg1) + %1 = "tf.IfRegion"(%false) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: (tensor) -> tensor + + // CHECK: return [[Val0]], [[Val1]] + return %0, %1 : tensor, tensor +} + +// CHECK-LABEL: foldIfRegionMismatchedTypes +func @foldIfRegionMismatchedTypes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<1xf32> { + %false = "tf.Const"() {value = dense : tensor} : () -> tensor + %true = "tf.Const"() {value = dense : tensor} : () -> tensor + + // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1) + // CHECK-NEXT: [[Cast:%.*]] = "tf.Cast"([[Val0]]) + // CHECK-NEXT: return [[Cast]] + %0 = "tf.IfRegion"(%true) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: (tensor) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + // CHECK-LABEL: foldCase func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { %2 = constant dense<1> : tensor @@ -834,11 +983,11 @@ func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { // CHECK: PartitionedCall // CHECK-SAME: device = "noodle" // CHECK-SAME: f = @add - %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], device = "noodle"} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], device = "noodle", is_stateless = false} : (tensor, tensor, tensor) -> tensor // CHECK: PartitionedCall // CHECK-SAME: _cluster_launch = "not_ready" // CHECK-SAME: f = @sub - %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], _cluster_launch = "not_ready"} : (tensor, tensor, tensor) -> tensor + %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], _cluster_launch = "not_ready", is_stateless = false} : (tensor, tensor, tensor) -> tensor return %5 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD new file mode 100644 index 00000000000..6be08ac988c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD @@ -0,0 +1,25 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +licenses(["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + ], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "mlir", + "pbtxt", + ], +) + +# Bundle together all of the test utilities that are used by tests. 
+filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir:tf-mlir-translate", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir new file mode 100644 index 00000000000..84e3f528a5c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-return-tuple | FileCheck %s +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-use-tuple-args -emit-return-tuple | FileCheck -check-prefix=TUPLE-ARGS %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[], [[ARG1:.*]]: f32[]) -> (f32[]) { +// CHECK-NEXT: %[[ARG0]] = f32[] parameter(0) +// CHECK-NEXT: %[[ARG1]] = f32[] parameter(1) +// CHECK-NEXT: [[ADD:%.*]] = f32[] add(f32[] %[[ARG0]], f32[] %[[ARG1]]) +// CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] [[ADD]]) +// CHECK-NEXT: } + +// CHECK: // InputMapping {0, 1} +// CHECK-NEXT: // XlaInputShape f32[] +// CHECK-NEXT: // XlaInputShape f32[] +// CHECK-NEXT: // XlaOutputShape (f32[]) +// CHECK-NEXT: // XlaOutputDescription type=float shape=() + + +// TUPLE-ARGS-LABEL: HloModule main +// TUPLE-ARGS: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[], f32[])) -> (f32[]) { +// TUPLE-ARGS: %[[ARG_TUPLE]] = (f32[], f32[]) parameter(0) +// TUPLE-ARGS: [[ARG0:%.*]] = f32[] get-tuple-element((f32[], f32[]) %[[ARG_TUPLE]]), index=0 +// TUPLE-ARGS: [[ARG1:%.*]] = f32[] get-tuple-element((f32[], f32[]) %[[ARG_TUPLE]]), index=1 +// TUPLE-ARGS: [[ADD:%.*]] = f32[] add(f32[] [[ARG0]], f32[] [[ARG1]]) +// TUPLE-ARGS: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] [[ADD]]) +// TUPLE-ARGS: } + +// TUPLE-ARGS: // InputMapping {0, 1} +// TUPLE-ARGS-NEXT: // XlaInputShape (f32[], f32[]) +// TUPLE-ARGS-NEXT: // XlaOutputShape (f32[]) +// TUPLE-ARGS-NEXT: // XlaOutputDescription type=float shape=() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir new file mode 100644 index 00000000000..5347037d7cf --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: not tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10 -emit-use-tuple-args -emit-return-tuple 2>&1 | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<128x8xf32> {mhlo.sharding = "bad_sharding"}) { + return + } +} + +// CHECK: failed to parse argument sharding 0 'bad_sharding' diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir new file mode 100644 index 00000000000..7154919c3d1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10:10,1024:128,1024 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes 
{tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, %arg1: tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<128x1024xf32> {mhlo.sharding = ""}) { + return + } +} + +// The following xla::OpSharding protos are used: +// Serialized string: +// "\08\03\1A\02\01\02\22\02\00\01" +// Proto debug string: +// type: OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 2 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// +// Serialized string: +// "\08\01\1A\01\01\22\01\00" +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// +// Serialized string: +// "" +// Proto debug string (empty but would equivalent to): +// type: REPLICATED + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[128,10], f32[10,1024], f32[128,1024])) -> () { +// CHECK: %[[ARG_TUPLE]] = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) parameter(0) +// CHECK-SAME: sharding={ +// CHECK-SAME: {devices=[1,2]0,1} +// CHECK-SAME: {maximal device=0} +// CHECK-SAME: {replicated} +// CHECK-SAME: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir new file mode 100644 index 00000000000..c745fbc0744 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir @@ -0,0 +1,16 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main() -> (tensor<0xi32>, tensor<0xi32>) { + %0 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %r0, %r1 = "tf.BroadcastGradientArgs"(%0, %0) {T = i32} : (tensor<0xi32>, tensor<0xi32>) -> (tensor<0xi32>, tensor<0xi32>) + return %r0, %r1 : tensor<0xi32>, tensor<0xi32> + } +} + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9+]}} ([[ARG_TUPLE:.*]]: ()) -> (s32[0], s32[0]) { +// CHECK: %[[ARG_TUPLE]] = () parameter(0) +// CHECK: [[CONSTANT:%.*]] = s32[0]{0} constant({}) +// CHECK: ROOT %tuple.{{[0-9]+}} = (s32[0]{0}, s32[0]{0}) tuple(s32[0]{0} [[CONSTANT]], s32[0]{0} [[CONSTANT]]) +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir new file mode 100644 index 00000000000..e54ff79e5e4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir @@ -0,0 +1,23 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=10,19:19,10 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<10x19xf32>, %arg1: tensor<19x10xf32> {mhlo.is_same_data_across_replicas}) -> tensor<10x19xf32> { + %0 = "tf.Shape"(%arg0) : (tensor<10x19xf32>) -> tensor<2xi64> + %1 = "tf.Reshape"(%arg1, %0) : (tensor<19x10xf32>, tensor<2xi64>) -> tensor<10x19xf32> + return %1 : tensor<10x19xf32> + } +} + +// Tests that foldable ops are constant-folded to enable legalization of ops +// that require compile time constant operand. +// "tf.Shape" can only be folded away after shape inference. tf.Reshape can only +// be lowered when tf.Shape is folded into a constant. 
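To make the comment above concrete, the fold this test depends on looks roughly like the following in TF dialect IR once shape inference has refined %arg0 to tensor<10x19xf32> (a sketch, not part of the test):

  %shape = "tf.Shape"(%arg0) : (tensor<10x19xf32>) -> tensor<2xi64>
  // ...folds to a compile-time constant, which is what lets tf.Reshape legalize:
  %shape = "tf.Const"() {value = dense<[10, 19]> : tensor<2xi64>} : () -> tensor<2xi64>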
+ +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[10,19], f32[19,10])) -> (f32[10,19]) { +// CHECK: %[[ARG_TUPLE]] = (f32[10,19]{1,0}, f32[19,10]{1,0}) parameter(0), parameter_replication={false,true} +// CHECK: [[ARG0:%.*]] = f32[10,19]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %[[ARG_TUPLE]]), index=0 +// CHECK: [[ARG1:%.*]] = f32[19,10]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %[[ARG_TUPLE]]), index=1 +// CHECK: [[RESHAPE:%.*]] = f32[10,19]{1,0} reshape(f32[19,10]{1,0} [[ARG1]]) +// CHECK: ROOT %tuple.{{[0-9]+}} = (f32[10,19]{1,0}) tuple(f32[10,19]{1,0} [[RESHAPE]]) +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir new file mode 100644 index 00000000000..3d1a34b932d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir @@ -0,0 +1,27 @@ +// RUN: tf-mlir-translate -mlir-tf-graph-to-hlo-text %s -tf-input-shapes=2:2 -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-xla-input-types=parameter,resource -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 511 : i32}} { + func @main(%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) { + tf_executor.graph { + %control = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg0) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + tf_executor.fetch %control : !tf_executor.control + } + return + } +} + +// Tests a conversion from Graph (tf_executor dialect MLIR) to MLIR with +// resource arguments. + +// CHECK-LABEL: HloModule main.{{[0-9]+}}, input_output_alias={ {0}: (1, {}, may-alias) } +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[2], [[ARG1:.*]]: f32[2]) -> (f32[2]) { +// CHECK-NEXT: %[[ARG1]] = f32[2]{0} parameter(1) +// CHECK-NEXT: %[[ARG0]] = f32[2]{0} parameter(0) +// CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[2]{0}) tuple(f32[2]{0} %[[ARG0]]) +// CHECK-NEXT: } + +// CHECK: // InputMapping {0, 1} +// CHECK-NEXT: // XlaInputShape f32[2] +// CHECK-NEXT: // XlaInputShape f32[2] +// CHECK-NEXT: // XlaOutputShape (f32[2]) +// CHECK-NEXT: // ResourceUpdate input_index=1 type=float shape=(2) modified diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt new file mode 100644 index 00000000000..5fb90b1bce0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt @@ -0,0 +1,66 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function | tf-mlir-translate -mlir-tf-graph-to-hlo-text -tf-input-shapes=2:2 -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-xla-input-types=parameter,resource -emit-return-tuple | FileCheck %s + +node { + name: "arg0" + op: "_Arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "arg1" + op: "_Arg" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "assign_variable" + op: "AssignVariableOp" + input: "arg1" + input: "arg0" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} +library { +} +versions { + producer: 511 +} + +# Tests a conversion from Graph to MLIR with resource arguments. 
+ +# CHECK-LABEL: HloModule main.{{[0-9]+}}, input_output_alias={ {0}: (1, {}, may-alias) } +# CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[2], [[ARG1:.*]]: f32[2]) -> (f32[2]) { +# CHECK-NEXT: %[[ARG1]] = f32[2]{0} parameter(1) +# CHECK-NEXT: %[[ARG0]] = f32[2]{0} parameter(0) +# CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[2]{0}) tuple(f32[2]{0} %[[ARG0]]) +# CHECK-NEXT: } + +# CHECK: // InputMapping {0, 1} +# CHECK-NEXT: // XlaInputShape f32[2] +# CHECK-NEXT: // XlaInputShape f32[2] +# CHECK-NEXT: // XlaOutputShape (f32[2]) +# CHECK-NEXT: // ResourceUpdate input_index=1 type=float shape=(2) modified diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt new file mode 100644 index 00000000000..f1f7c6434eb --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt @@ -0,0 +1,47 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function | tf-mlir-translate -mlir-tf-graph-to-hlo-text -tf-input-shapes='' -tf-input-data-types=DT_FLOAT -emit-return-tuple | FileCheck %s + +node { + name: "arg" + op: "_Arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "retval" + op: "_Retval" + input: "arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +versions { + producer: 511 +} + +# Verify that conversion from Graph to MLIR and empty shape representation +# function is successful. + +# CHECK-LABEL: HloModule main +# CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[]) -> (f32[]) { +# CHECK-NEXT: %[[ARG0]] = f32[] parameter(0) +# CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] %[[ARG0]]) +# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir new file mode 100644 index 00000000000..b68f177b183 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir @@ -0,0 +1,10 @@ +// RUN: tf-mlir-translate -mlir-tf-mlir-to-str-attr %s | FileCheck %s + +module attributes {tf.versions = {producer = 888 : i32}} { + func @main(%arg0: tensor) -> tensor { + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor loc(unknown) + return %0 : tensor loc(unknown) + } loc(unknown) +} loc(unknown) + +// CHECK: "\0A\0Amodule attributes {tf.versions = {producer = 888 : i32}} {\0A func @main(%arg0: tensor) -> tensor {\0A %0 = \22tf.Identity\22(%arg0) : (tensor) -> tensor loc(unknown)\0A return %0 : tensor loc(unknown)\0A } loc(unknown)\0A} loc(unknown)" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir new file mode 100644 index 00000000000..c9c02ba2588 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir @@ -0,0 +1,39 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10:10,1024:128,1024 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 351 : i32}} { + func @main(%arg0: tensor<128x10xf32>, %arg1: tensor<10x1024xf32>, %arg2: tensor<128x1024xf32>) -> (tensor<128x10xf32> {mhlo.sharding = 
"\08\03\1A\02\01\02\22\02\00\01"}, tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor<128x1024xf32> {mhlo.sharding = ""}) { + return %arg0, %arg1, %arg2 : tensor<128x10xf32>, tensor<10x1024xf32>, tensor<128x1024xf32> + } +} + +// The following xla::OpSharding protos are used: +// Serialized string: +// "\08\03\1A\02\01\02\22\02\00\01" +// Proto debug string: +// type: OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 2 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// +// Serialized string: +// "\08\01\1A\01\01\22\01\00" +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// +// Serialized string: +// "" +// Proto debug string (empty but would equivalent to): +// type: REPLICATED + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} +// CHECK-SAME: (arg_tuple.{{[0-9]+}}: (f32[128,10], f32[10,1024], f32[128,1024])) -> (f32[128,10], f32[10,1024], f32[128,1024]) { +// CHECK: ROOT %tuple.{{[0-9]+}} +// CHECK-SAME: sharding={ +// CHECK-SAME: {devices=[1,2]0,1} +// CHECK-SAME: {maximal device=0} +// CHECK-SAME: {replicated} +// CHECK-SAME: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir new file mode 100644 index 00000000000..ced11f3a083 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir @@ -0,0 +1,5 @@ +// RUN: not tf-mlir-translate -mlir-tf-str-attr-to-mlir %s 2>&1 | FileCheck %s + +"totally @invalid MLIR module {here} <-" + +// CHECK: Invalid argument: could not parse MLIR module-:1:1: error: custom op 'totally' is unknown diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir new file mode 100644 index 00000000000..9a0e1dc38c8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir @@ -0,0 +1,15 @@ +// RUN: tf-mlir-translate -mlir-tf-str-attr-to-mlir %s -mlir-print-debuginfo | FileCheck %s + +"\0A\0Amodule attributes {tf.versions = {producer = 888 : i32}} {\0A func @main(%arg0: tensor) -> tensor {\0A %0 = \22tf.Identity\22(%arg0) : (tensor) -> tensor loc(unknown)\0A return %0 : tensor loc(unknown)\0A } loc(unknown)\0A} loc(unknown)" + +// Test simple serialized computation consisting of a function named `main` +// with a tf.Identity op forwarding the function single argument to the function +// single result. 
+ +// CHECK-LABEL: module +// CHECK-SAME: attributes {tf.versions = {producer = 888 : i32}} { +// CHECK-NEXT: func @main([[ARG0:%.+]]: tensor) -> tensor { +// CHECK-NEXT: [[IDENTITY:%.+]] = "tf.Identity"([[ARG0]]) : (tensor) -> tensor loc(unknown) +// CHECK-NEXT: return [[IDENTITY]] : tensor loc(unknown) +// CHECK-NEXT: } loc(unknown) +// CHECK-NEXT: } loc(unknown) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir new file mode 100644 index 00000000000..55bdea5dd36 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir @@ -0,0 +1,11 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=8,16,16,64:64 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<8x16x16x64xbf16>, %arg1: tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) { + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg1, %arg1, %arg1) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32> + } +} + +// CHECK-LABEL: HloModule main +// CHECK: -> (bf16[8,16,16,64], f32[64], f32[64], f32[64], f32[64], f32[0]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir new file mode 100644 index 00000000000..f9eca514da3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir @@ -0,0 +1,11 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=10,17:17,19 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false} : (tensor<*xf32>, tensor) -> tensor + return %0 : tensor + } +} + +// CHECK-LABEL: HloModule main +// CHECK: (arg_tuple.{{[0-9]+}}: (f32[10,17], f32[17,19])) -> (f32[10,19]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index b86815dbe57..fff985efa6f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -89,16 +89,45 @@ func @testEmptybf16() -> (tensor<5xbf16>) { } // CHECK-LABEL: func @testShapeN -func @testShapeN(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>, %arg2: tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor<4xi64>, tensor) { +func @testShapeN(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>) -> (tensor<0xi64>, tensor<4xi64>) { - // CHECK: "tf.Const"() {value = dense<> : tensor<0xi64> - // CHECK: "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : 
tensor<0xi64>} + // CHECK: %[[SHAPE1:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} %0:2 = "tf.ShapeN"(%arg0, %arg1) : (tensor, tensor<1x32x32x16xf32>) -> (tensor<0xi64>, tensor<4xi64>) - // CHECK: tf.ShapeN - %1:2 = "tf.ShapeN"(%arg1, %arg2) : (tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<4xi64>, tensor) + // CHECK: return %[[SHAPE0]], %[[SHAPE1]] + return %0#0, %0#1 : tensor<0xi64>, tensor<4xi64> +} - return %0#0, %0#1, %1#0, %1#1 : tensor<0xi64>, tensor<4xi64>, tensor<4xi64>, tensor +// CHECK-LABEL: func @testShapeNPartialStatic +func @testShapeNPartialStatic(%arg0: tensor, %arg1: tensor<2x?x3xf32>, %arg2: tensor<1x32x32x16xf32>, %arg3: tensor<*xf32>) -> (tensor<0xi64>, tensor<3xi64>, tensor<4xi64>, tensor) { + // CHECK: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} + // CHECK: %[[SHAPE2:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK: %[[SHAPE13:.*]]:2 = "tf.ShapeN"(%arg1, %arg3) : (tensor<2x?x3xf32>, tensor<*xf32>) -> (tensor<3xi64>, tensor) + %0:4 = "tf.ShapeN"(%arg0, %arg1, %arg2, %arg3) : (tensor, tensor<2x?x3xf32>, tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<0xi64>, tensor<3xi64>, tensor<4xi64>, tensor) + + // CHECK: return %[[SHAPE0]], %[[SHAPE13]]#0, %[[SHAPE2]], %[[SHAPE13]]#1 + return %0#0, %0#1, %0#2, %0#3 : tensor<0xi64>, tensor<3xi64>, tensor<4xi64>, tensor +} + +// CHECK-LABEL: func @testShapeNOneDynamic +func @testShapeNOneDynamic(%arg0: tensor, %arg1: tensor<1x32x32x16xf32>, %arg2: tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor) { + // CHECK: %[[SHAPE0:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} + // CHECK: %[[SHAPE1:.*]] = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi64>} + // CHECK: %[[SHAPE2:.*]] = "tf.Shape"(%arg2) : (tensor<*xf32>) -> tensor + %0:3 = "tf.ShapeN"(%arg0, %arg1, %arg2) : (tensor, tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<0xi64>, tensor<4xi64>, tensor) + + // CHECK: return %[[SHAPE0]], %[[SHAPE1]], %[[SHAPE2]] + return %0#0, %0#1, %0#2 : tensor<0xi64>, tensor<4xi64>, tensor +} + +// CHECK-LABEL: func @testShapeNToShape +func @testShapeNToShape(%arg0: tensor<*xf32>) -> tensor { + // CHECK: %[[SHAPE0:.*]] = "tf.Shape"(%arg0) : (tensor<*xf32>) -> tensor + %0:1 = "tf.ShapeN"(%arg0) : (tensor<*xf32>) -> tensor + + // CHECK: return %[[SHAPE0]] + return %0#0 : tensor } // CHECK-LABEL: func @testLeakyRelu @@ -463,3 +492,13 @@ func @DontFoldTile() -> (tensor<8x10000xi32>) { return %3 : tensor<8x10000xi32> } // LINT.ThenChange(../transforms/constant_fold.cc:folding-policy) + +func @fold_conv() -> tensor<1x520x520x1xf32> { + %0 = "tf.Const"() {value = dense<0.111111112> : tensor<3x3x1x1xf32>} : () -> tensor<3x3x1x1xf32> + %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1x520x520x1xf32>} : () -> tensor<1x520x520x1xf32> + %2 = "tf.DepthwiseConv2dNative"(%1, %0) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x520x520x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x520x520x1xf32> + return %2 : tensor<1x520x520x1xf32> + + // CHECK: tf.Const + // CHECK-NOT: tf.DepthwiseConv2dNative +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir b/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir new file mode 100644 index 00000000000..b12f50ad525 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir @@ -0,0 +1,37 @@ +// RUN: tf-opt %s -tf-contraction-fusion | FileCheck %s + +// CHECK-LABEL: 
matmulBiasAdd +func @matmulBiasAdd(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: fusion = ["BiasAdd"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %4 : tensor<8x64xf32> +} + +// CHECK-LABEL: matmulBiasAddRelu +func @matmulBiasAddRelu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: fusion = ["BiasAdd", "Relu"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + %5 = "tf.Relu"(%4) : (tensor<8x64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %5 : tensor<8x64xf32> +} + +// CHECK-LABEL: matmulBiasAddLeakyRelu +func @matmulBiasAddLeakyRelu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: alpha = 2.000000e-01 : f32 + // CHECK-SAME: fusion = ["BiasAdd", "LeakyRelu"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + %5 = "tf.LeakyRelu"(%4) { alpha = 0.2 : f32 } : (tensor<8x64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %5 : tensor<8x64xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index ff4dbf41221..e6a92a520f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -101,7 +101,7 @@ func @decompose_resource_apply_momentum_non_nesterov(%arg0: tensor, %arg1: // CHECK: [[ACCUM:%.*]] = "tf.ReadVariableOp"([[ACCUM_HANDLE]]) // CHECK: [[ACCUM_MOMENTUM:%.*]] = "tf.Mul"([[ACCUM]], [[MOMENTUM]]) - // CHECK: [[ACCUM_NEW:%.*]] = "tf.Add"([[ACCUM_MOMENTUM]], [[GRAD]]) + // CHECK: [[ACCUM_NEW:%.*]] = "tf.AddV2"([[ACCUM_MOMENTUM]], [[GRAD]]) // CHECK: "tf.AssignVariableOp"([[ACCUM_HANDLE]], [[ACCUM_NEW]]) // CHECK: [[ACCUM_NEW_LR:%.*]] = "tf.Mul"([[ACCUM_NEW]], [[LR]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) @@ -127,12 +127,12 @@ func @decompose_resource_apply_momentum_nesterov(%arg0: tensor, %arg1: tens // CHECK: [[ACCUM:%.*]] = "tf.ReadVariableOp"([[ACCUM_HANDLE]]) // CHECK: [[ACCUM_MOMENTUM:%.*]] = "tf.Mul"([[ACCUM]], [[MOMENTUM]]) - // CHECK: [[ACCUM_NEW:%.*]] = "tf.Add"([[ACCUM_MOMENTUM]], [[GRAD]]) + // CHECK: [[ACCUM_NEW:%.*]] = "tf.AddV2"([[ACCUM_MOMENTUM]], [[GRAD]]) // CHECK: "tf.AssignVariableOp"([[ACCUM_HANDLE]], [[ACCUM_NEW]]) // CHECK: [[GRAD_LR:%.*]] = "tf.Mul"([[GRAD]], [[LR]]) // CHECK: [[MOMENTUM_LR:%.*]] = 
"tf.Mul"([[MOMENTUM]], [[LR]]) // CHECK: [[ACCUM_NEW_MOMENTUM_LR:%.*]] = "tf.Mul"([[ACCUM_NEW]], [[MOMENTUM_LR]]) - // CHECK: [[DELTA:%.*]] = "tf.Add"([[GRAD_LR]], [[ACCUM_NEW_MOMENTUM_LR]]) + // CHECK: [[DELTA:%.*]] = "tf.AddV2"([[GRAD_LR]], [[ACCUM_NEW_MOMENTUM_LR]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[DELTA]]) // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[VAR_NEW]]) @@ -231,6 +231,31 @@ func @decompose_resource_apply_adagradv2(%arg0: tensor, %arg1: tensor, return } +// ----- +// CHECK-LABEL: func @decompose_resource_apply_adagrad +// CHECK-SAME: (%[[LR:.*]]: tensor, %[[GRAD:.*]]: tensor) +func @decompose_resource_apply_adagrad(%arg0: tensor, %arg1: tensor) -> () { + + // CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + // CHECK: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + // CHECK: %[[GRAD_SQUARE:.*]] = "tf.Mul"(%[[GRAD]], %[[GRAD]]) : (tensor, tensor) -> tensor + // CHECK: %[[ACCUM:.*]] = "tf.ReadVariableOp"(%[[ACCUM_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> + // CHECK: %[[ACCUM_NEW:.*]] = "tf.AddV2"(%[[ACCUM]], %[[GRAD_SQUARE]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> + // CHECK: %[[LR_MULTIPLY:.*]] = "tf.Mul"(%[[LR]], %[[GRAD]]) : (tensor, tensor) -> tensor + // CHECK: %[[SQRT:.*]] = "tf.Sqrt"(%[[ACCUM_NEW]]) : (tensor<*xf32>) -> tensor<*xf32> + // CHECK: %[[DIV:.*]] = "tf.Div"(%[[LR_MULTIPLY]], %[[SQRT]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> + // CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> + // CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + + "tf.ResourceApplyAdagrad"(%0, %1, %arg0, %arg1) {update_slots = true, use_locking = true} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor) -> () + + return +} + // ----- // Tests that composite tf.ResourceApplyAdam (non-Nesterov) operation is @@ -388,14 +413,14 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens // CHECK: [[GRAD_SUB:%.*]] = "tf.Mul"([[GRADSQ]], [[SB]]) // CHECK: [[MS:%.*]] = "tf.ReadVariableOp"([[MS_HANDLE]]) // CHECK: [[MS_RHO:%.*]] = "tf.Mul"([[MS]], [[RHO]]) - // CHECK: [[MS_NEW:%.*]] = "tf.Add"([[GRAD_SUB]], [[MS_RHO]]) + // CHECK: [[MS_NEW:%.*]] = "tf.AddV2"([[GRAD_SUB]], [[MS_RHO]]) // CHECK: "tf.AssignVariableOp"([[MS_HANDLE]], [[MS_NEW]]) // CHECK: [[SUB_RHO:%.*]] = "tf.Sub"([[ONE]], [[RHO]]) // CHECK: [[SUB_GRAD:%.*]] = "tf.Mul"([[GRAD]], [[SUB_RHO]]) // CHECK: [[MG:%.*]] = "tf.ReadVariableOp"([[MG_HANDLE]]) // CHECK: [[MG_RHO:%.*]] = "tf.Mul"([[MG]], [[RHO]]) - // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[SUB_GRAD]], [[MG_RHO]]) + // CHECK: [[MG_NEW:%.*]] = "tf.AddV2"([[SUB_GRAD]], [[MG_RHO]]) // CHECK: "tf.AssignVariableOp"([[MG_HANDLE]], [[MG_NEW]]) // CHECK: [[MOM:%.*]] = "tf.ReadVariableOp"([[MOM_HANDLE]]) @@ -403,11 +428,11 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens // CHECK: 
[[LR_GRAD:%.*]] = "tf.Mul"([[LR]], [[GRAD]]) // CHECK: [[MG_MG:%.*]] = "tf.Mul"([[MG_NEW]], [[MG_NEW]]) - // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[MG_MG]], [[EPSILON]]) + // CHECK: [[MG_NEW:%.*]] = "tf.AddV2"([[MG_MG]], [[EPSILON]]) // CHECK: [[MG_SUB:%.*]] = "tf.Sub"([[MS_NEW]], [[MG_NEW]]) // CHECK: [[MG_SQRT:%.*]] = "tf.Sqrt"([[MG_SUB]]) // CHECK: [[MOM_DIV:%.*]] = "tf.Div"([[LR_GRAD]], [[MG_SQRT]]) - // CHECK: [[MOM_NEW:%.*]] = "tf.Add"([[MOM_MOM]], [[MOM_DIV]]) + // CHECK: [[MOM_NEW:%.*]] = "tf.AddV2"([[MOM_MOM]], [[MOM_DIV]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[MOM_NEW]]) @@ -416,6 +441,33 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens "tf.ResourceApplyCenteredRMSProp"(%0, %1, %2, %3, %arg4, %arg5, %arg6, %arg7, %arg8) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () return } +// ----- +// CHECK-LABEL: func @decompose_resource_apply_RMS_prop +// CHECK-SAME: (%[[VAR_HANDLE:.*]]: tensor<*x!tf.resource>, %[[MS_HANDLE:.*]]: tensor<*x!tf.resource>, %[[MOM_HANDLE:.*]]: tensor<*x!tf.resource>, +// CHECK-SAME: %[[LR:.*]]: tensor, %[[RHO:.*]]: tensor, %[[MOMENTUM:.*]]: tensor, %[[EPSILON:.*]]: tensor, %[[GRAD:.*]]: tensor) +func @decompose_resource_apply_RMS_prop(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>, %arg2: tensor<*x!tf.resource>, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor) -> () { +// CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: %[[MS:.*]] = "tf.ReadVariableOp"(%[[MS_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[MS_RHO:.*]] = "tf.Mul"(%[[MS]], %[[RHO]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %[[GRAD_SQUARE:.*]] = "tf.Square"(%[[GRAD]]) : (tensor) -> tensor +// CHECK: %[[ONE_RHO:.*]] = "tf.Sub"(%[[ONE]], %[[RHO]]) : (tensor, tensor) -> tensor +// CHECK: %[[MUL:.*]] = "tf.Mul"(%[[GRAD_SQUARE]], %[[ONE_RHO]]) : (tensor, tensor) -> tensor +// CHECK: %[[MS_NEW:.*]] = "tf.AddV2"(%[[MS_RHO]], %[[MUL]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[MS_HANDLE]], %[[MS_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () +// CHECK: %[[MOM:.*]] = "tf.ReadVariableOp"(%[[MOM_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[MOMENTUM_MOM:.*]] = "tf.Mul"(%[[MOMENTUM]], %[[MOM]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[LR_GRAD:.*]] = "tf.Mul"(%[[LR]], %[[GRAD]]) : (tensor, tensor) -> tensor +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MS_NEW]], %[[EPSILON]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %[[SQRT:.*]] = "tf.Sqrt"(%[[ADD]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[DIV:.*]] = "tf.Div"(%[[LR_GRAD]], %[[SQRT]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[MOM_NEW:.*]] = "tf.AddV2"(%[[MOMENTUM_MOM]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[MOM_HANDLE]], %[[MOM_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () +// CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[MOM_NEW]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + "tf.ResourceApplyRMSProp"(%arg0, %arg1, %arg2, %arg3, 
%arg4, %arg5, %arg6, %arg7) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () + return +} // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir b/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir new file mode 100644 index 00000000000..8250bcf7101 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir @@ -0,0 +1,16 @@ +// RUN: tf-opt -tf-tensor-device-copy %s | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @fold_identity +// CHECK-SAME: ([[arg0:%.*]]: tensor<2x2xf32>, [[arg1:%.*]]: tensor<2x2xf32> +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32}} { + func @fold_identity(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %0 = tf_executor.graph { + // CHECK: tf.MatMul + %outputs, %control = tf_executor.island wraps "tf.MatMul"(%arg0, %arg1) {device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK-NOT: tf.Identity + %outputs_0, %control_1 = tf_executor.island wraps "tf.Identity"(%outputs) {device = ""} : (tensor<2x2xf32>) -> tensor<2x2xf32> + tf_executor.fetch %outputs_0 : tensor<2x2xf32> + } + return %0 : tensor<2x2xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir index bec48181b3b..726495f1fbc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir @@ -220,7 +220,7 @@ func @merge_islands_only() { %11:2 = tf_executor.island(%10#1) wraps "tf.opF"() : () -> tensor %12:2 = tf_executor.island wraps "tf.opG"(%10#0, %11#0) : (tensor<*xi32>, tensor) -> tensor<*xi32> %13 = tf_executor.ControlTrigger %2, %12#1, %9#1 - tf_executor.NextIteration.Sink [%3#1] %12#0, %13 : tensor<*xi32> + tf_executor.NextIteration.Sink[%3#1] %12#0, %13 : tensor<*xi32> tf_executor.fetch } return @@ -244,7 +244,7 @@ func @merge_islands_only() { // CHECK-NEXT: %[[OP_G:[0-9]*]] = "tf.opG"(%[[OP_E]], %[[OP_F]]) // CHECK-NEXT: tf_executor.yield %[[OP_G]] : tensor<*xi32> // CHECK: %[[CT:.*]] = tf_executor.ControlTrigger %[[ISLAND_1]], %[[ISLAND_3_control]], %[[EXIT_control]] -// CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC_token]]] %[[ISLAND_3]], %[[CT]] +// CHECK-NEXT: tf_executor.NextIteration.Sink[%[[NEXTIT_SRC_token]]] %[[ISLAND_3]], %[[CT]] // Test no merging took place as cycle would be formed otherwise. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir index 7d761b5d690..0000d43823b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir @@ -16,7 +16,7 @@ module { "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1, topology = "topology"} : () -> () %index = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor %input = "tf.opB"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor - %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4]} : (tensor, tensor) -> tensor + %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4], is_stateless = false} : (tensor, tensor) -> tensor tf_executor.yield %result : tensor } tf_executor.fetch %output : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir index c8c82c5c08f..e4e7f0859c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -123,6 +123,27 @@ func @testIfNoInputAndNoResult(%arg0: tensor) -> () { // ----- +// If with non tensor condition + +// Simple If +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> + +// CHECK-LABEL: func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) +func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.If"(%arg0, %arg1) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor, tensor<*xf32>) -> tensor<*xf32> + + // CHECK: [[ToBool:%.*]] = "tf.ToBool" + // CHECK: "tf.IfRegion"([[ToBool]]) + return %0 : tensor<*xf32> +} + +// ----- + // Simple While func @testWhileCond(tensor<*xf32>) -> (tensor) func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) @@ -200,3 +221,58 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { return %1 : tensor<*xf32> } +// ----- + +// While with non tensor condition +func @testWhileCond(tensor<*xf32>) -> (tensor) +func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) + +// CHECK-LABEL: func @testWhileResult +func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + %1 = "tf.While"(%arg0) { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = true, + _attr0 = 10, _attr1 = true, attr2 = "hello" + } : (tensor<*xf32>) -> (tensor<*xf32>) + + // CHECK: [[Result0:%.*]] = "tf.WhileRegion" + // CHECK: [[Result1:%.*]] = call @testWhileCond + // CHECK: [[ToBool:%.*]] = "tf.ToBool"([[Result1]]) + // CHECK: "tf.Yield"([[ToBool]]) + // CHECK: [[Result2:%.*]] = call @testWhileBody + // CHECK: "tf.Yield"([[Result2]]) + // CHECK: return [[Result0]] + return %1 : tensor<*xf32> +} + +// ----- + +func @then_branch() -> () +func @else_branch() -> () + +// Test tf.If device is preserved. 
+// CHECK-LABEL: func @testIfDevice +func @testIfDevice(%arg0: tensor) { + "tf.If"(%arg0) {then_branch = @then_branch, else_branch = @else_branch, is_stateless = false, device = "/device:CPU:0"} : (tensor) -> () + + // CHECK: "tf.IfRegion" + // CHECK: device = "/device:CPU:0" + return +} + +// ----- + +func @cond() -> tensor +func @body() -> () + +// Test tf.While device is preserved. +// CHECK-LABEL: func @testWhileDevice +func @testWhileDevice() { + "tf.While"() {cond = @cond, body = @body, is_stateless = false, device = "/device:CPU:0"} : () -> () + + // CHECK: "tf.WhileRegion" + // CHECK: device = "/device:CPU:0" + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt new file mode 100644 index 00000000000..1372ad71283 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/case_op.pbtxt @@ -0,0 +1,261 @@ +# RUN: tf-mlir-translate -graphdef-to-splatted-mlir %s -o - | FileCheck %s + +node { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "Const_1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "indexed_case" + op: "StatelessCase" + input: "Const_1" + input: "Const" + attr { + key: "Tin" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_INT32 + } + } + } + attr { + key: "_lower_using_switch_merge" + value { + b: true + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "branches" + value { + list { + func { + name: "indexed_case_branch0_4" + } + func { + name: "indexed_case_branch1_5" + } + } + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } +} +node { + name: "indexed_case/Identity" + op: "Identity" + input: "indexed_case" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +library { + function { + signature { + name: "indexed_case_branch0_4" + input_arg { + name: "add_const" + type: DT_INT32 + } + output_arg { + name: "add" + type: DT_INT32 + } + } + node_def { + name: "add/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } + experimental_debug_info { + original_node_names: "add/y" + } + } + node_def { + name: "add_0" + op: "AddV2" + input: "add_const" + input: "add/y:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "add" + } + } + ret { + key: "add" + value: "add_0:z:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + } + } + function { + signature { + name: "indexed_case_branch1_5" + input_arg { + name: "add_const" + type: DT_INT32 + } + output_arg { + name: "add" + type: DT_INT32 + } + } + node_def { + name: "add/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 2 + } + } + } + experimental_debug_info { + original_node_names: "add/y" + } + } + node_def { + name: "add_0" + op: "AddV2" + input: "add_const" + input: 
"add/y:output:0" + attr { + key: "T" + value { + type: DT_INT32 + } + } + experimental_debug_info { + original_node_names: "add" + } + } + ret { + key: "add" + value: "add_0:z:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + } + } +} +versions { + producer: 486 + min_consumer: 12 +} + +# CHECK: tf.Case +# CHECK-SAME: is_stateless diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt index e21fd901a9e..a6b1979ee26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt @@ -7,7 +7,7 @@ # CHECK: %[[NEXTITERATION:[a-z0-9]+]], %[[NEXTITERATION_token:[a-z0-9]+]], {{.*}} = tf_executor.NextIteration.Source # CHECK: tf_executor.Merge {{.*}} %[[NEXTITERATION]] -# CHECK: tf_executor.NextIteration.Sink [%[[NEXTITERATION_token]]] +# CHECK: tf_executor.NextIteration.Sink[%[[NEXTITERATION_token]]] node { name: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir index 30599b2e437..9bb05a75877 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32 // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -38,7 +38,7 @@ func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32 func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<*xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -49,7 +49,7 @@ func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: ten // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<*xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: 
%[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -77,7 +77,7 @@ func @transposeConv2DBackpropFilter( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[IN_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) @@ -117,7 +117,7 @@ func @transposeConv2DBackpropInput( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) // CHECK: %[[CONV2D_BACKPROP:[0-9]*]] = "tf.Conv2DBackpropInput" @@ -130,7 +130,7 @@ func @transposeConv2DBackpropInput( // CHECK-SAME: (tensor<4xi32>, tensor<1x1x3x8xf32>, tensor<1x8x32x32xf32>) // CHECK-SAME: -> tensor<1x3x32x32xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D_BACKPROP]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -154,7 +154,7 @@ func @transposeFusedBatchNormV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -164,7 +164,7 @@ func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -192,7 +192,7 @@ func @transposeFusedBatchNormGradV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG0_TPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[ARG1_TPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) @@ -204,7 +204,7 @@ func @transposeFusedBatchNormGradV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TPOSE:[0-9]*]] = "tf.Transpose" // CHECK-SAME: (%x_backprop, %[[RES_PERM]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir index e6b3bf08394..c71d8ef2850 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func 
@transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32 // CHECK-SAME: strides = [5, 7, 8, 6] // CHECK-SAME: (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -41,7 +41,7 @@ func @transposeFusedBatchNormV3( ) -> tensor<1x64x28x28xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -51,7 +51,7 @@ func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x28x28x64xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir index 0b1e27733eb..bacfeea2dc9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir @@ -65,3 +65,40 @@ func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } + +// CHECK-LABEL: move_transpose_handle_broadcast +func @move_transpose_handle_broadcast(%arg0:tensor<8x64xf32>, %arg1:tensor<8x64x64xf32>) -> tensor<512x64xf32> { + %cst = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_2 = "tf.Const"() {value = dense<[512, 64]> : tensor<2xi32>} : () -> tensor<2xi32> + %0 = "tf.ExpandDims"(%arg0, %cst) {device = ""} : (tensor<8x64xf32>, tensor) -> tensor<8x64x1xf32> + %1 = "tf.AddV2"(%0, %arg1) {device = ""} : (tensor<8x64x1xf32>, tensor<8x64x64xf32>) -> tensor<8x64x64xf32> + %2 = "tf.Transpose"(%1, %cst_1) {device = ""} : (tensor<8x64x64xf32>, tensor<3xi32>) -> tensor<64x8x64xf32> + %3 = "tf.Reshape"(%2, %cst_2) {device = ""} : (tensor<64x8x64xf32>, tensor<2xi32>) -> tensor<512x64xf32> + + return %3 : tensor<512x64xf32> + + // CHECK: %[[CST_0:.*]] = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[CST_1:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK: %[[CST_2:.*]] = "tf.Const"() {value = dense<[512, 64]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[EXPAND_DIMS:.*]] = "tf.ExpandDims"(%arg0, %[[CST_1]]) {device = ""} : (tensor<8x64xf32>, tensor) -> 
tensor<8x64x1xf32> + // CHECK: %[[TRANSPOSE_1:.*]] = "tf.Transpose"(%[[EXPAND_DIMS]], %[[CST_0]]) : (tensor<8x64x1xf32>, tensor<3xi32>) -> tensor<1x8x64xf32> + // CHECK: %[[TRANSPOSE_2:.*]] = "tf.Transpose"(%arg1, %[[CST_0]]) : (tensor<8x64x64xf32>, tensor<3xi32>) -> tensor<64x8x64xf32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TRANSPOSE_1]], %[[TRANSPOSE_2]]) {device = ""} : (tensor<1x8x64xf32>, tensor<64x8x64xf32>) -> tensor<64x8x64xf32> + // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%[[ADD]], %[[CST_2]]) {device = ""} : (tensor<64x8x64xf32>, tensor<2xi32>) -> tensor<512x64xf32> + // CHECK: return %[[RESHAPE]] : tensor<512x64xf32> +} + +// CHECK-LABEL: dont_move_transpose_different_ranks +func @dont_move_transpose_different_ranks(%arg0:tensor<1x1x2x3xf32>, %arg1:tensor<2x3xf32>) -> tensor<1x2x1x3xf32> { + %cst = "tf.Const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %0 = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<1x1x2x3xf32>, tensor<2x3xf32>) -> tensor<1x1x2x3xf32> + %1 = "tf.Transpose"(%0, %cst) {device = ""} : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + + return %1 : tensor<1x2x1x3xf32> + + // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<1x1x2x3xf32>, tensor<2x3xf32>) -> tensor<1x1x2x3xf32> + // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[ADD]], %[[CST]]) {device = ""} : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + // CHECK: return %[[TRANSPOSE]] : tensor<1x2x1x3xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 4f044cd5eff..9864cffee7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -1,177 +1,396 @@ +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // RUN: tf-opt -tf-legalize-hlo %s | FileCheck %s +// CHECK-LABEL: func @biasAdd_NHWC( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x10x32xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> +// CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> +// CHECK: } func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } +// CHECK-LABEL: func @biasAdd_NCHW( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x10x32xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> +// CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> +// CHECK: } func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } +// CHECK-LABEL: func @biasAdd_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], 
%[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_1]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_2]] : tensor<2xi32> +// CHECK: } func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.add %arg0, %arg0 : tensor<2xi32> %1 = mhlo.add %0, %arg0 : tensor<2xi32> return %1 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @broadcast_multi_dim_add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x1x1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> +// CHECK: return %[[VAL_2]] : tensor<4x4x4x4xi32> +// CHECK: } func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0 : tensor<4x4x4x4xi32> } +// CHECK-LABEL: func @div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @shift_left( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.LeftShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.shift_left %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } 
+// CHECK-LABEL: func @div_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @maximum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>) -> tensor<4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK: return %[[VAL_2]] : tensor<4xf32> +// CHECK: } func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %0 = mhlo.maximum %arg0, %arg1 : tensor<4xf32> return %0 : tensor<4xf32> } +// CHECK-LABEL: func @minimum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>) -> tensor<4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK: return %[[VAL_2]] : tensor<4xf32> +// CHECK: } func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %0 = mhlo.minimum %arg0, %arg1 : tensor<4xf32> return %0 : tensor<4xf32> } +// CHECK-LABEL: func @mul( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.multiply %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_mul( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @real_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_real_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @sub( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, 
tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.subtract %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_sub( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @shift_right( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.RightShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @broadcast_shift_right( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2x4xi32>) -> tensor<2x4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.RightShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> +// CHECK: return %[[VAL_2]] : tensor<2x4xi32> +// CHECK: } func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { %0 = "chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> return %0 : tensor<2x4xi32> } -func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { - %0 = mhlo.and %arg0, %arg0 : tensor<2xi1> +// CHECK-LABEL: func @and( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi1>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @and(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { + %0 = mhlo.and %arg0, %arg1 : tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @and_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @and_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi1>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi1>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { %0 = "chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { - 
%0 = mhlo.or %arg0, %arg0 : tensor<2xi1> +// CHECK-LABEL: func @or( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi1>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @or(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { + %0 = mhlo.or %arg0, %arg1 : tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @or_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @or_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi1>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi1>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { %0 = "chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitwise_or( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.or %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @bitwise_or_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> +// CHECK: return %[[VAL_2]] : tensor<1x4xi8> +// CHECK: } func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } +// CHECK-LABEL: func @bitwise_or_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitwise_and( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.and %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @bitwise_and_broadcast( 
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> +// CHECK: return %[[VAL_2]] : tensor<1x4xi8> +// CHECK: } func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } +// CHECK-LABEL: func @bitwise_and_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @pow( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.power %arg0, %arg0 : tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @pow_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_0]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @pow_dynamic(%arg0: tensor) -> tensor { %0 = mhlo.power %arg0, %arg0 : tensor return %0 : tensor } +// CHECK-LABEL: func @floordiv_broadcast_i32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3xi32>) -> tensor<2x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_14:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_15:.*]] = "tf.Div"(%[[VAL_13]], %[[VAL_14]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_16:.*]] = "tf.Select"(%[[VAL_6]], %[[VAL_7]], %[[VAL_15]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: return %[[VAL_16]] : 
tensor<2x3xi32> +// CHECK: } func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { %0 = mhlo.constant dense<0> : tensor<2x3xi32> %1 = "chlo.broadcast_compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> @@ -191,6 +410,26 @@ func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> te return %14 : tensor<2x3xi32> } +// CHECK-LABEL: func @floordiv_reverse_broadcast_i32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3xi32>) -> tensor<2x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], %[[VAL_4]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_14:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_15:.*]] = "tf.Div"(%[[VAL_13]], %[[VAL_14]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_16:.*]] = "tf.Select"(%[[VAL_6]], %[[VAL_7]], %[[VAL_15]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: return %[[VAL_16]] : tensor<2x3xi32> +// CHECK: } func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = mhlo.constant dense<0> : tensor<3xi32> %1 = "mhlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> @@ -210,6 +449,13 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 return %14 : tensor<2x3xi32> } +// CHECK-LABEL: func @floordiv_f32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_3:.*]] = "tf.FloorDiv"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_3]] : tensor<2xf32> +// CHECK: } func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xf32> %1 = mhlo.divide %arg0, %arg0 : tensor<2xf32> @@ -217,6 +463,14 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %2 : tensor<2xf32> } +// CHECK-LABEL: func @floordiv_f16_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xf16>, +// 
CHECK-SAME: %[[VAL_1:.*]]: tensor<3xf16>) -> tensor<2x3xf16> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: %[[VAL_3:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: %[[VAL_4:.*]] = "tf.FloorDiv"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: return %[[VAL_4]] : tensor<2x3xf16> +// CHECK: } func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> %1 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> @@ -224,118 +478,252 @@ func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> te return %2 : tensor<2x3xf16> } +// CHECK-LABEL: func @equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_0]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @equal_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @equal_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @equal_incompatible_shape_broadcastable( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] 
= "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @notequal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_0]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @notequal_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @greater( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.Greater"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_greater( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Greater"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, 
tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @greater_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_greater_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @less( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_less( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @less_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_less_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> 
: tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @concat_v2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<6x3xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> +// CHECK: return %[[VAL_3]] : tensor<6x3xf32> +// CHECK: } func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> return %2 : tensor<6x3xf32> } +// CHECK-LABEL: func @concat_v2_1d_axis( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<3x6xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> +// CHECK: return %[[VAL_3]] : tensor<3x6xf32> +// CHECK: } func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x6xf32> { %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 1 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> return %2 : tensor<3x6xf32> } +// CHECK-LABEL: func @const() -> tensor<2xi32> { +// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: return %[[VAL_0]] : tensor<2xi32> +// CHECK: } func @const() -> tensor<2xi32> { %0 = mhlo.constant dense<0> : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @relu( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_1]], %[[VAL_0]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> +// CHECK: return %[[VAL_2]] : tensor<1xi32> +// CHECK: } func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> return %1 : tensor<1xi32> } +// CHECK-LABEL: func @relu_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_1]], %[[VAL_0]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @relu_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<0> : tensor %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %1 : tensor } +// CHECK-LABEL: func @relu6( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> +// CHECK: return %[[VAL_4]] : tensor<1xi32> +// CHECK: } func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = 
mhlo.constant dense<6> : tensor @@ -344,6 +732,14 @@ func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { return %3 : tensor<1xi32> } +// CHECK-LABEL: func @relu6_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_4]] : tensor +// CHECK: } func @relu6_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<6> : tensor @@ -352,6 +748,15 @@ func @relu6_unranked(%arg0: tensor) -> tensor { return %3 : tensor } +// CHECK-LABEL: func @relu_grad( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor<4x8xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Greater"(%[[VAL_1]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x8xf32>} : () -> tensor<4x8xf32> +// CHECK: %[[VAL_5:.*]] = "tf.Select"(%[[VAL_3]], %[[VAL_0]], %[[VAL_4]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> +// CHECK: return %[[VAL_5]] : tensor<4x8xf32> +// CHECK: } func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf32> { %0 = mhlo.constant dense<0.000000e+00> : tensor %1 = "chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor @@ -360,31 +765,74 @@ func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf3 return %3 : tensor<4x8xf32> } +// CHECK-LABEL: func @select( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @select(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @select_float( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_3]] : tensor<2xf32> +// CHECK: } func @select_float(%arg0: tensor<2xi1>, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @select_multidimensional( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<3x2xi32>) -> tensor<3x2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> +// CHECK: return %[[VAL_3]] : tensor<3x2xi32> +// CHECK: } func 
@select_multidimensional(%arg0: tensor<3x2xi1>, %arg1: tensor<3x2xi32>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> return %0 : tensor<3x2xi32> } +// CHECK-LABEL: func @selectv2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @selectv2(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @selectv2_pred_scalar( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @selectv2_pred_scalar(%arg0: tensor, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @transpose_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2xf32> +// CHECK: } func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -392,6 +840,14 @@ func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { return %2 : tensor<3x2xf32> } +// CHECK-LABEL: func @transpose_3d_int32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> +// CHECK: } func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi32> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> @@ -399,6 +855,14 @@ func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { return %2 : tensor<3x2x1xf32> } +// CHECK-LABEL: func @transpose_3d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = 
dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> +// CHECK: } func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> @@ -406,6 +870,14 @@ func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { return %2 : tensor<3x2x1xf32> } +// CHECK-LABEL: func @transpose_dynamic_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor<4x?xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor, tensor<2xi64>) -> tensor<4x?xf32> +// CHECK: return %[[VAL_4]] : tensor<4x?xf32> +// CHECK: } func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -413,6 +885,14 @@ func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { return %2 : tensor<4x?xf32> } +// CHECK-LABEL: func @transpose_unranked_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<*xf32>, tensor<2xi64>) -> tensor<*xf32> +// CHECK: return %[[VAL_4]] : tensor<*xf32> +// CHECK: } func @transpose_unranked_2d(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -420,146 +900,297 @@ func @transpose_unranked_2d(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %2 : tensor<*xf32> } +// CHECK-LABEL: func @abs( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @abs(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @abs_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @abs_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.abs"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @abs_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @abs_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @ceil( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// 
CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @ceil(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.ceil"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @ceil_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @ceil_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.ceil"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @ceil_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @ceil_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.ceil"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @complex_abs( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xcomplex>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.ComplexAbs"(%[[VAL_0]]) : (tensor<2xcomplex>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @complex_abs(%arg0: tensor<2xcomplex>) -> tensor<2xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<2xcomplex>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @cos( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @cos(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.cosine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @cos_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @cos_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.cosine"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @cos_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @cos_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.cosine"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @exp( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @exp(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @exp_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @exp_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.exponential"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @exp_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @exp_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.exponential"(%arg0) : (tensor<*xf32>) -> 
tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @floor( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @floor(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.floor"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @floor_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @floor_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.floor"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @floor_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @floor_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.floor"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @is_finite( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @is_finite(%arg0: tensor<2xf32>) -> tensor<2xi1> { %0 = "mhlo.is_finite"(%arg0) : (tensor<2xf32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @is_finite_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @is_finite_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.is_finite"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @is_finite_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xi1> +// CHECK: return %[[VAL_1]] : tensor<*xi1> +// CHECK: } func @is_finite_unranked(%arg0: tensor<*xf32>) -> tensor<*xi1> { %0 = "mhlo.is_finite"(%arg0) : (tensor<*xf32>) -> tensor<*xi1> return %0 : tensor<*xi1> } +// CHECK-LABEL: func @log( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @log(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.log"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @log_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @log_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.log"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @log_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @log_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.log"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @log1p( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return 
%[[VAL_1]] : tensor<2xf32> +// CHECK: } func @log1p(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.log_plus_one"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @log1p_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @log1p_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.log_plus_one"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @log1p_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @log1p_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.log_plus_one"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @neg( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @neg(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.negate"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @neg_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @neg_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.negate"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @neg_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @neg_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.negate"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @sigmoid( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_5:.*]] = "tf.Tanh"(%[[VAL_4]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Mul"(%[[VAL_5]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_7:.*]] = "tf.AddV2"(%[[VAL_6]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_7]] : tensor<2xf32> +// CHECK: } func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.constant dense<5.000000e-01> : tensor %1 = mhlo.constant dense<2> : tensor<1xi64> @@ -571,86 +1202,177 @@ func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %6 : tensor<2xf32> } +// CHECK-LABEL: func @sin( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @sin(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.sine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @sin_dynamic( +// 
CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @sin_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.sine"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @sin_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @sin_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.sine"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @rsqrt( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @rsqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @rsqrt_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @rsqrt_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.rsqrt"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @rsqrt_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @rsqrt_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.rsqrt"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @sqrt( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @sqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.sqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @sqrt_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @sqrt_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.sqrt"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @sqrt_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @sqrt_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.sqrt"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @tanh( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @tanh(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.tanh"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @tanh_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @tanh_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.tanh"(%arg0) : (tensor) -> tensor return %0 : tensor } 
+// CHECK-LABEL: func @tanh_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @tanh_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.tanh"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @bitcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @bitcast(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @bitcast_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @bitcast_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitcast_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @bitcast_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @bitcast_same_widths( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @bitcast_same_widths(%arg0: tensor<2xf32>) -> tensor<2xi32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<2xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @sign( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_0]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_3:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_0]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_5:.*]] = "tf.Sign"(%[[VAL_0]]) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Select"(%[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_7:.*]] = "tf.Select"(%[[VAL_1]], %[[VAL_2]], %[[VAL_6]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: return %[[VAL_7]] : tensor<1x2x3x4xf32> +// CHECK: } func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> %1 = mhlo.constant dense<0.000000e+00> : tensor<1x2x3x4xf32> @@ -662,72 +1384,180 @@ func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { return %6 : tensor<1x2x3x4xf32> } +// CHECK-LABEL: func @size_rank_one_i32( +// 
CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @size_rank_one_i32(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<1> : tensor return %0 : tensor } +// CHECK-LABEL: func @size_rank_one_i64( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @size_rank_one_i64(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<1> : tensor return %0 : tensor } +// CHECK-LABEL: func @complex( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3xf32>) -> tensor<3xcomplex> { +// CHECK: %[[VAL_2:.*]] = "tf.Complex"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex> +// CHECK: return %[[VAL_2]] : tensor<3xcomplex> +// CHECK: } func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex> { %0 = "mhlo.complex"(%arg0, %arg1) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex> return %0 : tensor<3xcomplex> } +// CHECK-LABEL: func @convert_i32_f32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cast"(%[[VAL_0]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { %0 = "mhlo.convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @convert_slice( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Slice"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> +// CHECK: return %[[VAL_3]] : tensor<1x519xf32> +// CHECK: } func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x4672xf32>) -> tensor<1x519xf32> return %0 : tensor<1x519xf32> } +// CHECK-LABEL: func @reshape( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> +// CHECK: return %[[VAL_2]] : tensor<2x2x6xf32> +// CHECK: } func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { %0 = "mhlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> return %0 : tensor<2x2x6xf32> } +// CHECK-LABEL: func @convert_dot_1d_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = "tf.MatMul"(%[[VAL_3]], %[[VAL_1]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_5:.*]] = constant dense<1> : tensor<1xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], 
%[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_6]] : tensor<1xf32> +// CHECK: } func @convert_dot_1d_2d(%arg0: tensor<256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256x1xf32>) -> tensor<1xf32> return %0 : tensor<1xf32> } +// CHECK-LABEL: func @convert_dot_2d_1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = "tf.MatMul"(%[[VAL_0]], %[[VAL_3]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_5:.*]] = constant dense<1> : tensor<1xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_6]] : tensor<1xf32> +// CHECK: } func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) -> tensor<1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256xf32>) -> tensor<1xf32> return %0 : tensor<1xf32> } +// CHECK-LABEL: func @convert_dot_1d_1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_6:.*]] = "tf.MatMul"(%[[VAL_3]], %[[VAL_5]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_7:.*]] = constant dense<> : tensor<0xi64> +// CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_6]], %[[VAL_7]]) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor +// CHECK: return %[[VAL_8]] : tensor +// CHECK: } func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256xf32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @convert_dot_2d_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.MatMul"(%[[VAL_0]], %[[VAL_1]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: return %[[VAL_2]] : tensor<1x1xf32> +// CHECK: } func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> return %0 : tensor<1x1xf32> } +// CHECK-LABEL: func @broadcast_in_dim_tf_style( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.BroadcastTo"(%[[VAL_0]], %[[VAL_1]]) : (tensor<8x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<3x8x8x16xf32> +// CHECK: } 
func @broadcast_in_dim_tf_style(%arg0: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> return %0 : tensor<3x8x8x16xf32> } +// CHECK-LABEL: func @broadcast_in_dim_general_case( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[3, 1, 1, 16]> : tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3x1x16xf32>, tensor<4xi64>) -> tensor<3x1x1x16xf32> +// CHECK: %[[VAL_3:.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> +// CHECK: %[[VAL_4:.*]] = "tf.BroadcastTo"(%[[VAL_2]], %[[VAL_3]]) : (tensor<3x1x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> +// CHECK: return %[[VAL_4]] : tensor<3x8x8x16xf32> +// CHECK: } func @broadcast_in_dim_general_case(%arg0: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> return %0 : tensor<3x8x8x16xf32> } +// CHECK-LABEL: func @convert_dot_general( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x2x6x5x1xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 3, 4, 1, 2]> : tensor<5xi64>} : () -> tensor<5xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3x2x6x5x1xf32>, tensor<5xi64>) -> tensor<3x5x1x2x6xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x2x4x6xf32>, tensor<4xi64>) -> tensor<3x2x6x4xf32> +// CHECK: %[[VAL_6:.*]] = constant dense<[3, 5, 12]> : tensor<3xi64> +// CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_6]]) : (tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> +// CHECK: %[[VAL_8:.*]] = constant dense<[3, 12, 4]> : tensor<3xi64> +// CHECK: %[[VAL_9:.*]] = "tf.Reshape"(%[[VAL_5]], %[[VAL_8]]) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> +// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV2"(%[[VAL_7]], %[[VAL_9]]) {adj_x = false, adj_y = false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> +// CHECK: %[[VAL_11:.*]] = constant dense<[3, 5, 1, 4]> : tensor<4xi64> +// CHECK: %[[VAL_12:.*]] = "tf.Reshape"(%[[VAL_10]], %[[VAL_11]]) : (tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> +// CHECK: return %[[VAL_12]] : tensor<3x5x1x4xf32> +// CHECK: } func @convert_dot_general(%arg0: tensor<3x2x6x5x1xf32>, %arg1: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<[1, 2]> : tensor<2xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<[1, 3]> : tensor<2xi64>}, precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<3x2x6x5x1xf32>, tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> return %0 : tensor<3x5x1x4xf32> } +// CHECK-LABEL: func @convert_conv2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], 
use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -736,6 +1566,12 @@ func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32> return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_depthwise_conv2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.DepthwiseConv2dNative"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -744,6 +1580,12 @@ func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x2 return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_conv2d_valid_padding( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -752,6 +1594,13 @@ func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3 return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_reduce_to_sum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { 
+// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { %0 = mhlo.constant dense<0.000000e+00> : tensor %1 = "mhlo.reduce"(%arg0, %0) ( { @@ -762,6 +1611,13 @@ func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } +// CHECK-LABEL: func @convert_reduce_to_max( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0xFF800000> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // "0xFF800000" represents -INF for f32. %0 = mhlo.constant dense<0xFF800000> : tensor @@ -773,7 +1629,13 @@ func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } - +// CHECK-LABEL: func @convert_reduce_to_min( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0x7F800000> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // "0x7F800000" represents INF for f32. 
   %0 = mhlo.constant dense<0x7F800000> : tensor<f32>
@@ -785,928 +1647,31 @@ func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> {
   return %1 : tensor<1xf32>
 }
+// CHECK-LABEL: func @convert_iota_1d() -> tensor<123xf32> {
+// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32>
+// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1.230000e+02> : tensor<f32>} : () -> tensor<f32>
+// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
+// CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<f32>, tensor<f32>, tensor<f32>) -> tensor<123xf32>
+// CHECK: return %[[VAL_3]] : tensor<123xf32>
+// CHECK: }
+func @convert_iota_1d() -> tensor<123xf32> {
+  %0 = "mhlo.iota"() { iota_dimension = 0 : i64 } : () -> tensor<123xf32>
+  return %0 : tensor<123xf32>
+}
+
+// CHECK-LABEL: func @convert_iota_3d() -> tensor<5x7x9xi32> {
+// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
+// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<7> : tensor<i32>} : () -> tensor<i32>
+// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
+// CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<7xi32>
+// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[1, 7, 1]> : tensor<3xi64>} : () -> tensor<3xi64>
+// CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_4]]) : (tensor<7xi32>, tensor<3xi64>) -> tensor<1x7x1xi32>
+// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[5, 7, 9]> : tensor<3xi64>} : () -> tensor<3xi64>
+// CHECK: %[[VAL_7:.*]] = "tf.BroadcastTo"(%[[VAL_5]], %[[VAL_6]]) : (tensor<1x7x1xi32>, tensor<3xi64>) -> tensor<5x7x9xi32>
+// CHECK: return %[[VAL_7]] : tensor<5x7x9xi32>
+// CHECK: }
+func @convert_iota_3d() -> tensor<5x7x9xi32> {
+  %0 = "mhlo.iota"() { iota_dimension = 1 : i64 } : () -> tensor<5x7x9xi32>
+  return %0 : tensor<5x7x9xi32>
+}
-
-
-// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-// CHECK-LABEL: func @biasAdd_NHWC(
-// CHECK-SAME: [[VAL_0:%.*]]: tensor<1x32x10x32xi32>, [[VAL_1:%.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> {
-// CHECK: [[VAL_2:%.*]] = "tf.AddV2"([[VAL_0]], [[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32>
-// CHECK: return [[VAL_2]] : tensor<1x32x10x32xi32>
-// CHECK: }
-
-// CHECK-LABEL: func @biasAdd_NCHW(
-// CHECK-SAME: [[VAL_3:%.*]]: tensor<1x32x10x32xi32>, [[VAL_4:%.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> {
-// CHECK: [[VAL_5:%.*]] = "tf.AddV2"([[VAL_3]], [[VAL_4]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32>
-// CHECK: return [[VAL_5]] : tensor<1x32x10x32xi32>
-// CHECK: }
-
-// CHECK-LABEL: func @biasAdd_dynamic(
-// CHECK-SAME: [[VAL_6:%.*]]: tensor, [[VAL_7:%.*]]: tensor) -> tensor {
-// CHECK: [[VAL_8:%.*]] = "tf.AddV2"([[VAL_6]], [[VAL_7]]) : (tensor, tensor) -> tensor
-// CHECK: return [[VAL_8]] : tensor
-// CHECK: }
-
-// CHECK-LABEL: func @add(
-// CHECK-SAME: [[VAL_9:%.*]]: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK: [[VAL_10:%.*]] = "tf.AddV2"([[VAL_9]], [[VAL_9]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK: [[VAL_11:%.*]] = "tf.AddV2"([[VAL_10]], [[VAL_9]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK: return [[VAL_11]] : tensor<2xi32>
-// CHECK: }
-
-// CHECK-LABEL: func @broadcast_add(
-// CHECK-SAME: [[VAL_12:%.*]]: tensor<1xi32>, [[VAL_13:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> {
-// CHECK: [[VAL_14:%.*]] = "tf.AddV2"([[VAL_12]],
[[VAL_13]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_14]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_multi_dim_add( -// CHECK-SAME: [[VAL_15:%.*]]: tensor<4x1x1xi32>, [[VAL_16:%.*]]: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { -// CHECK: [[VAL_17:%.*]] = "tf.AddV2"([[VAL_15]], [[VAL_16]]) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> -// CHECK: return [[VAL_17]] : tensor<4x4x4x4xi32> -// CHECK: } - -// CHECK-LABEL: func @div( -// CHECK-SAME: [[VAL_18:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_19:%.*]] = "tf.Div"([[VAL_18]], [[VAL_18]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_19]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_div( -// CHECK-SAME: [[VAL_20:%.*]]: tensor<1xi32>, [[VAL_21:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_22:%.*]] = "tf.Div"([[VAL_20]], [[VAL_21]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_22]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @shift_left( -// CHECK-SAME: [[VAL_23:%.*]]: tensor<4xi32>, [[VAL_24:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_25:%.*]] = "tf.LeftShift"([[VAL_23]], [[VAL_24]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_25]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @div_dynamic( -// CHECK-SAME: [[VAL_26:%.*]]: tensor, [[VAL_27:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_28:%.*]] = "tf.Div"([[VAL_26]], [[VAL_27]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_28]] : tensor -// CHECK: } - -// CHECK-LABEL: func @maximum( -// CHECK-SAME: [[VAL_29:%.*]]: tensor<4xf32>, [[VAL_30:%.*]]: tensor<4xf32>) -> tensor<4xf32> { -// CHECK: [[VAL_31:%.*]] = "tf.Maximum"([[VAL_29]], [[VAL_30]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> -// CHECK: return [[VAL_31]] : tensor<4xf32> -// CHECK: } - -// CHECK-LABEL: func @minimum( -// CHECK-SAME: [[VAL_32:%.*]]: tensor<4xf32>, [[VAL_33:%.*]]: tensor<4xf32>) -> tensor<4xf32> { -// CHECK: [[VAL_34:%.*]] = "tf.Minimum"([[VAL_32]], [[VAL_33]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> -// CHECK: return [[VAL_34]] : tensor<4xf32> -// CHECK: } - -// CHECK-LABEL: func @mul( -// CHECK-SAME: [[VAL_35:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_36:%.*]] = "tf.Mul"([[VAL_35]], [[VAL_35]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_36]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_mul( -// CHECK-SAME: [[VAL_37:%.*]]: tensor<1xi32>, [[VAL_38:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_39:%.*]] = "tf.Mul"([[VAL_37]], [[VAL_38]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_39]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @real_div( -// CHECK-SAME: [[VAL_40:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_41:%.*]] = "tf.Div"([[VAL_40]], [[VAL_40]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_41]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_real_div( -// CHECK-SAME: [[VAL_42:%.*]]: tensor<1xi32>, [[VAL_43:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_44:%.*]] = "tf.Div"([[VAL_42]], [[VAL_43]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_44]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @sub( -// CHECK-SAME: [[VAL_45:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_46:%.*]] = "tf.Sub"([[VAL_45]], 
[[VAL_45]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_46]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_sub( -// CHECK-SAME: [[VAL_47:%.*]]: tensor<1xi32>, [[VAL_48:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_49:%.*]] = "tf.Sub"([[VAL_47]], [[VAL_48]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_49]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @shift_right( -// CHECK-SAME: [[VAL_50:%.*]]: tensor<4xi32>, [[VAL_51:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_52:%.*]] = "tf.RightShift"([[VAL_50]], [[VAL_51]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_52]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_shift_right( -// CHECK-SAME: [[VAL_53:%.*]]: tensor<4xi32>, [[VAL_54:%.*]]: tensor<2x4xi32>) -> tensor<2x4xi32> { -// CHECK: [[VAL_55:%.*]] = "tf.RightShift"([[VAL_53]], [[VAL_54]]) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> -// CHECK: return [[VAL_55]] : tensor<2x4xi32> -// CHECK: } - -// CHECK-LABEL: func @and( -// CHECK-SAME: [[VAL_56:%.*]]: tensor<2xi1>) -> tensor<2xi1> { -// CHECK: [[VAL_57:%.*]] = "tf.LogicalAnd"([[VAL_56]], [[VAL_56]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> -// CHECK: return [[VAL_57]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @and_broadcast( -// CHECK-SAME: [[VAL_58:%.*]]: tensor<1xi1>, [[VAL_59:%.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { -// CHECK: [[VAL_60:%.*]] = "tf.LogicalAnd"([[VAL_58]], [[VAL_59]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> -// CHECK: return [[VAL_60]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @and_dynamic( -// CHECK-SAME: [[VAL_61:%.*]]: tensor, [[VAL_62:%.*]]: tensor<1xi1>) -> tensor { -// CHECK: [[VAL_63:%.*]] = "tf.LogicalAnd"([[VAL_61]], [[VAL_62]]) : (tensor, tensor<1xi1>) -> tensor -// CHECK: return [[VAL_63]] : tensor -// CHECK: } - -// CHECK-LABEL: func @or( -// CHECK-SAME: [[VAL_64:%.*]]: tensor<2xi1>) -> tensor<2xi1> { -// CHECK: [[VAL_65:%.*]] = "tf.LogicalOr"([[VAL_64]], [[VAL_64]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> -// CHECK: return [[VAL_65]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @or_broadcast( -// CHECK-SAME: [[VAL_66:%.*]]: tensor<1xi1>, [[VAL_67:%.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { -// CHECK: [[VAL_68:%.*]] = "tf.LogicalOr"([[VAL_66]], [[VAL_67]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> -// CHECK: return [[VAL_68]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @or_dynamic( -// CHECK-SAME: [[VAL_69:%.*]]: tensor, [[VAL_70:%.*]]: tensor<1xi1>) -> tensor { -// CHECK: [[VAL_71:%.*]] = "tf.LogicalOr"([[VAL_69]], [[VAL_70]]) : (tensor, tensor<1xi1>) -> tensor -// CHECK: return [[VAL_71]] : tensor -// CHECK: } - -// CHECK-LABEL: func @bitwise_or( -// CHECK-SAME: [[VAL_72:%.*]]: tensor<4xi32>, [[VAL_73:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_74:%.*]] = "tf.BitwiseOr"([[VAL_72]], [[VAL_73]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_74]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @bitwise_or_broadcast( -// CHECK-SAME: [[VAL_75:%.*]]: tensor<1xi8>, [[VAL_76:%.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { -// CHECK: [[VAL_77:%.*]] = "tf.BitwiseOr"([[VAL_75]], [[VAL_76]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> -// CHECK: return [[VAL_77]] : tensor<1x4xi8> -// CHECK: } - -// CHECK-LABEL: func @bitwise_or_dynamic( -// CHECK-SAME: [[VAL_78:%.*]]: tensor, [[VAL_79:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: 
[[VAL_80:%.*]] = "tf.BitwiseOr"([[VAL_78]], [[VAL_79]]) : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_80]] : tensor -// CHECK: } - -// CHECK-LABEL: func @bitwise_and( -// CHECK-SAME: [[VAL_81:%.*]]: tensor<4xi32>, [[VAL_82:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_83:%.*]] = "tf.BitwiseAnd"([[VAL_81]], [[VAL_82]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_83]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @bitwise_and_broadcast( -// CHECK-SAME: [[VAL_84:%.*]]: tensor<1xi8>, [[VAL_85:%.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { -// CHECK: [[VAL_86:%.*]] = "tf.BitwiseAnd"([[VAL_84]], [[VAL_85]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> -// CHECK: return [[VAL_86]] : tensor<1x4xi8> -// CHECK: } - -// CHECK-LABEL: func @bitwise_and_dynamic( -// CHECK-SAME: [[VAL_87:%.*]]: tensor, [[VAL_88:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_89:%.*]] = "tf.BitwiseAnd"([[VAL_87]], [[VAL_88]]) : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_89]] : tensor -// CHECK: } - -// CHECK-LABEL: func @pow( -// CHECK-SAME: [[VAL_90:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_91:%.*]] = "tf.Pow"([[VAL_90]], [[VAL_90]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_91]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @pow_dynamic( -// CHECK-SAME: [[VAL_92:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_93:%.*]] = "tf.Pow"([[VAL_92]], [[VAL_92]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_93]] : tensor -// CHECK: } - -// CHECK-LABEL: func @floordiv_broadcast_i32( -// CHECK-SAME: [[VAL_94:%.*]]: tensor<2x3xi32>, [[VAL_95:%.*]]: tensor<3xi32>) -> tensor<2x3xi32> { -// CHECK: [[VAL_96:%.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_97:%.*]] = "tf.Less"([[VAL_94]], [[VAL_96]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: [[VAL_98:%.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_99:%.*]] = "tf.Less"([[VAL_95]], [[VAL_98]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: [[VAL_100:%.*]] = "tf.Equal"([[VAL_97]], [[VAL_99]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_101:%.*]] = "tf.Div"([[VAL_94]], [[VAL_95]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_102:%.*]] = "tf.Abs"([[VAL_94]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_103:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_104:%.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_105:%.*]] = "tf.Sub"([[VAL_103]], [[VAL_104]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_106:%.*]] = "tf.AddV2"([[VAL_102]], [[VAL_105]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_107:%.*]] = "tf.Neg"([[VAL_106]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_108:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_109:%.*]] = "tf.Div"([[VAL_107]], [[VAL_108]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_110:%.*]] = "tf.Select"([[VAL_100]], [[VAL_101]], [[VAL_109]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: return [[VAL_110]] : tensor<2x3xi32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_reverse_broadcast_i32( -// CHECK-SAME: [[VAL_111:%.*]]: tensor<3xi32>, [[VAL_112:%.*]]: tensor<2x3xi32>) 
-> tensor<2x3xi32> { -// CHECK: [[VAL_113:%.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_114:%.*]] = "tf.Less"([[VAL_111]], [[VAL_113]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: [[VAL_115:%.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_116:%.*]] = "tf.Less"([[VAL_112]], [[VAL_115]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: [[VAL_117:%.*]] = "tf.Equal"([[VAL_114]], [[VAL_116]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_118:%.*]] = "tf.Div"([[VAL_111]], [[VAL_112]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_119:%.*]] = "tf.Abs"([[VAL_111]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_120:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_121:%.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_122:%.*]] = "tf.Sub"([[VAL_120]], [[VAL_121]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_123:%.*]] = "tf.AddV2"([[VAL_119]], [[VAL_122]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_124:%.*]] = "tf.Neg"([[VAL_123]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_125:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_126:%.*]] = "tf.Div"([[VAL_124]], [[VAL_125]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_127:%.*]] = "tf.Select"([[VAL_117]], [[VAL_118]], [[VAL_126]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: return [[VAL_127]] : tensor<2x3xi32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_f32( -// CHECK-SAME: [[VAL_128:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_129:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_130:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_131:%.*]] = "tf.FloorDiv"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_131]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_f16_broadcast( -// CHECK-SAME: [[VAL_132:%.*]]: tensor<2x3xf16>, [[VAL_133:%.*]]: tensor<3xf16>) -> tensor<2x3xf16> { -// CHECK: [[VAL_134:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: [[VAL_135:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: [[VAL_136:%.*]] = "tf.FloorDiv"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: return [[VAL_136]] : tensor<2x3xf16> -// CHECK: } - -// CHECK-LABEL: func @equal( -// CHECK-SAME: [[VAL_137:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_138:%.*]] = "tf.Equal"([[VAL_137]], [[VAL_137]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_138]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_dynamic( -// CHECK-SAME: [[VAL_139:%.*]]: tensor, [[VAL_140:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_141:%.*]] = "tf.Equal"([[VAL_139]], [[VAL_140]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_141]] : tensor -// CHECK: } - -// CHECK-LABEL: func @equal_broadcast( -// CHECK-SAME: [[VAL_142:%.*]]: tensor<1xi32>, 
[[VAL_143:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_144:%.*]] = "tf.Equal"([[VAL_142]], [[VAL_143]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_144]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error( -// CHECK-SAME: [[VAL_145:%.*]]: tensor<2xi32>, [[VAL_146:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_147:%.*]] = "tf.Equal"([[VAL_145]], [[VAL_146]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_147]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_incompatible_shape_broadcastable( -// CHECK-SAME: [[VAL_148:%.*]]: tensor, [[VAL_149:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_150:%.*]] = "tf.Equal"([[VAL_148]], [[VAL_149]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_150]] : tensor -// CHECK: } - -// CHECK-LABEL: func @notequal( -// CHECK-SAME: [[VAL_151:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_152:%.*]] = "tf.NotEqual"([[VAL_151]], [[VAL_151]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_152]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_broadcast( -// CHECK-SAME: [[VAL_153:%.*]]: tensor<1xi32>, [[VAL_154:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_155:%.*]] = "tf.NotEqual"([[VAL_153]], [[VAL_154]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_155]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error( -// CHECK-SAME: [[VAL_156:%.*]]: tensor<2xi32>, [[VAL_157:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_158:%.*]] = "tf.NotEqual"([[VAL_156]], [[VAL_157]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_158]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable( -// CHECK-SAME: [[VAL_159:%.*]]: tensor, [[VAL_160:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_161:%.*]] = "tf.NotEqual"([[VAL_159]], [[VAL_160]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_161]] : tensor -// CHECK: } - -// CHECK-LABEL: func @greater( -// CHECK-SAME: [[VAL_162:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_163:%.*]] = "tf.Greater"([[VAL_162]], [[VAL_162]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_163]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_greater( -// CHECK-SAME: [[VAL_164:%.*]]: tensor<1xi32>, [[VAL_165:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_166:%.*]] = "tf.Greater"([[VAL_164]], [[VAL_165]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_166]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @greater_equal( -// CHECK-SAME: [[VAL_167:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_168:%.*]] = "tf.GreaterEqual"([[VAL_167]], [[VAL_167]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_168]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_greater_equal( -// CHECK-SAME: [[VAL_169:%.*]]: tensor<1xi32>, [[VAL_170:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_171:%.*]] = "tf.GreaterEqual"([[VAL_169]], [[VAL_170]]) : (tensor<1xi32>, tensor<1x2xi32>) -> 
tensor<1x2xi1> -// CHECK: return [[VAL_171]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @less( -// CHECK-SAME: [[VAL_172:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_173:%.*]] = "tf.Less"([[VAL_172]], [[VAL_172]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_173]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_less( -// CHECK-SAME: [[VAL_174:%.*]]: tensor<1xi32>, [[VAL_175:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_176:%.*]] = "tf.Less"([[VAL_174]], [[VAL_175]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_176]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @less_equal( -// CHECK-SAME: [[VAL_177:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_178:%.*]] = "tf.LessEqual"([[VAL_177]], [[VAL_177]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_178]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_less_equal( -// CHECK-SAME: [[VAL_179:%.*]]: tensor<1xi32>, [[VAL_180:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_181:%.*]] = "tf.LessEqual"([[VAL_179]], [[VAL_180]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_181]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @concat_v2( -// CHECK-SAME: [[VAL_182:%.*]]: tensor<3x3xf32>, [[VAL_183:%.*]]: tensor<3x3xf32>) -> tensor<6x3xf32> { -// CHECK: [[VAL_184:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_185:%.*]] = "tf.ConcatV2"([[VAL_182]], [[VAL_183]], [[VAL_184]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> -// CHECK: return [[VAL_185]] : tensor<6x3xf32> -// CHECK: } - -// CHECK-LABEL: func @concat_v2_1d_axis( -// CHECK-SAME: [[VAL_186:%.*]]: tensor<3x3xf32>, [[VAL_187:%.*]]: tensor<3x3xf32>) -> tensor<3x6xf32> { -// CHECK: [[VAL_188:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: [[VAL_189:%.*]] = "tf.ConcatV2"([[VAL_186]], [[VAL_187]], [[VAL_188]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> -// CHECK: return [[VAL_189]] : tensor<3x6xf32> -// CHECK: } - -// CHECK-LABEL: func @const() -> tensor<2xi32> { -// CHECK: [[VAL_190:%.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: return [[VAL_190]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @relu( -// CHECK-SAME: [[VAL_192:%.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK: [[VAL_193:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_194:%.*]] = "tf.Maximum"([[VAL_193]], [[VAL_192]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> -// CHECK: return [[VAL_194]] : tensor<1xi32> -// CHECK: } - -// CHECK-LABEL: func @relu_unranked( -// CHECK-SAME: [[VAL_195:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_196:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_197:%.*]] = "tf.Maximum"([[VAL_196]], [[VAL_195]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_197]] : tensor -// CHECK: } - -// CHECK-LABEL: func @relu6( -// CHECK-SAME: [[VAL_198:%.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK: [[VAL_199:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_200:%.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: [[VAL_201:%.*]] = "tf.Minimum"([[VAL_198]], [[VAL_200]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> -// CHECK: [[VAL_202:%.*]] = "tf.Maximum"([[VAL_201]], [[VAL_199]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> -// CHECK: return 
[[VAL_202]] : tensor<1xi32> -// CHECK: } - -// CHECK-LABEL: func @relu6_unranked( -// CHECK-SAME: [[VAL_203:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_204:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_205:%.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: [[VAL_206:%.*]] = "tf.Minimum"([[VAL_203]], [[VAL_205]]) : (tensor, tensor) -> tensor -// CHECK: [[VAL_207:%.*]] = "tf.Maximum"([[VAL_206]], [[VAL_204]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_207]] : tensor -// CHECK: } - -// CHECK-LABEL: func @relu_grad( -// CHECK-SAME: [[VAL_208:%.*]]: tensor<4x8xf32>, [[VAL_209:%.*]]: tensor) -> tensor<4x8xf32> { -// CHECK: [[VAL_210:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK: [[VAL_211:%.*]] = "tf.Greater"([[VAL_209]], [[VAL_210]]) : (tensor, tensor) -> tensor -// CHECK: [[VAL_212:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x8xf32>} : () -> tensor<4x8xf32> -// CHECK: [[VAL_213:%.*]] = "tf.Select"([[VAL_211]], [[VAL_208]], [[VAL_212]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> -// CHECK: return [[VAL_213]] : tensor<4x8xf32> -// CHECK: } - -// CHECK-LABEL: func @select( -// CHECK-SAME: [[VAL_214:%.*]]: tensor<2xi1>, [[VAL_215:%.*]]: tensor<2xi32>, [[VAL_216:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_217:%.*]] = "tf.Select"([[VAL_214]], [[VAL_215]], [[VAL_216]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_217]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @select_float( -// CHECK-SAME: [[VAL_218:%.*]]: tensor<2xi1>, [[VAL_219:%.*]]: tensor<2xf32>, [[VAL_220:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_221:%.*]] = "tf.Select"([[VAL_218]], [[VAL_219]], [[VAL_220]]) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_221]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @select_multidimensional( -// CHECK-SAME: [[VAL_222:%.*]]: tensor<3x2xi1>, [[VAL_223:%.*]]: tensor<3x2xi32>, [[VAL_224:%.*]]: tensor<3x2xi32>) -> tensor<3x2xi32> { -// CHECK: [[VAL_225:%.*]] = "tf.Select"([[VAL_222]], [[VAL_223]], [[VAL_224]]) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> -// CHECK: return [[VAL_225]] : tensor<3x2xi32> -// CHECK: } - -// CHECK-LABEL: func @selectv2( -// CHECK-SAME: [[VAL_226:%.*]]: tensor<2xi1>, [[VAL_227:%.*]]: tensor<2xi32>, [[VAL_228:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_229:%.*]] = "tf.Select"([[VAL_226]], [[VAL_227]], [[VAL_228]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_229]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @selectv2_pred_scalar( -// CHECK-SAME: [[VAL_230:%.*]]: tensor, [[VAL_231:%.*]]: tensor<2xi32>, [[VAL_232:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_233:%.*]] = "tf.Select"([[VAL_230]], [[VAL_231]], [[VAL_232]]) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_233]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @transpose_2d( -// CHECK-SAME: [[VAL_234:%.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> { -// CHECK: [[VAL_235:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_236:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_237:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_238:%.*]] = "tf.Transpose"([[VAL_234]], 
[[VAL_237]]) : (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> -// CHECK: return [[VAL_238]] : tensor<3x2xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_3d_int32( -// CHECK-SAME: [[VAL_239:%.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK: [[VAL_240:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_241:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_242:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_243:%.*]] = "tf.Transpose"([[VAL_239]], [[VAL_242]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> -// CHECK: return [[VAL_243]] : tensor<3x2x1xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_3d( -// CHECK-SAME: [[VAL_244:%.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK: [[VAL_245:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_246:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_247:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_248:%.*]] = "tf.Transpose"([[VAL_244]], [[VAL_247]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> -// CHECK: return [[VAL_248]] : tensor<3x2x1xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_dynamic_2d( -// CHECK-SAME: [[VAL_249:%.*]]: tensor) -> tensor<4x?xf32> { -// CHECK: [[VAL_250:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_251:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_252:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_253:%.*]] = "tf.Transpose"([[VAL_249]], [[VAL_252]]) : (tensor, tensor<2xi64>) -> tensor<4x?xf32> -// CHECK: return [[VAL_253]] : tensor<4x?xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_unranked_2d( -// CHECK-SAME: [[VAL_254:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_255:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_256:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_257:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_258:%.*]] = "tf.Transpose"([[VAL_254]], [[VAL_257]]) : (tensor<*xf32>, tensor<2xi64>) -> tensor<*xf32> -// CHECK: return [[VAL_258]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @abs( -// CHECK-SAME: [[VAL_259:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_260:%.*]] = "tf.Abs"([[VAL_259]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_260]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @abs_dynamic( -// CHECK-SAME: [[VAL_261:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_262:%.*]] = "tf.Abs"([[VAL_261]]) : (tensor) -> tensor -// CHECK: return [[VAL_262]] : tensor -// CHECK: } - -// CHECK-LABEL: func @abs_unranked( -// CHECK-SAME: [[VAL_263:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_264:%.*]] = "tf.Abs"([[VAL_263]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_264]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @ceil( -// CHECK-SAME: [[VAL_265:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_266:%.*]] = "tf.Ceil"([[VAL_265]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_266]] : tensor<2xf32> -// CHECK: } - -// 
CHECK-LABEL: func @ceil_dynamic( -// CHECK-SAME: [[VAL_267:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_268:%.*]] = "tf.Ceil"([[VAL_267]]) : (tensor) -> tensor -// CHECK: return [[VAL_268]] : tensor -// CHECK: } - -// CHECK-LABEL: func @ceil_unranked( -// CHECK-SAME: [[VAL_269:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_270:%.*]] = "tf.Ceil"([[VAL_269]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_270]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @complex_abs( -// CHECK-SAME: [[VAL_271:%.*]]: tensor<2xcomplex>) -> tensor<2xf32> { -// CHECK: [[VAL_272:%.*]] = "tf.ComplexAbs"([[VAL_271]]) : (tensor<2xcomplex>) -> tensor<2xf32> -// CHECK: return [[VAL_272]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @cos( -// CHECK-SAME: [[VAL_273:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_274:%.*]] = "tf.Cos"([[VAL_273]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_274]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @cos_dynamic( -// CHECK-SAME: [[VAL_275:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_276:%.*]] = "tf.Cos"([[VAL_275]]) : (tensor) -> tensor -// CHECK: return [[VAL_276]] : tensor -// CHECK: } - -// CHECK-LABEL: func @cos_unranked( -// CHECK-SAME: [[VAL_277:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_278:%.*]] = "tf.Cos"([[VAL_277]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_278]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @exp( -// CHECK-SAME: [[VAL_279:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_280:%.*]] = "tf.Exp"([[VAL_279]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_280]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @exp_dynamic( -// CHECK-SAME: [[VAL_281:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_282:%.*]] = "tf.Exp"([[VAL_281]]) : (tensor) -> tensor -// CHECK: return [[VAL_282]] : tensor -// CHECK: } - -// CHECK-LABEL: func @exp_unranked( -// CHECK-SAME: [[VAL_283:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_284:%.*]] = "tf.Exp"([[VAL_283]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_284]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @floor( -// CHECK-SAME: [[VAL_285:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_286:%.*]] = "tf.Floor"([[VAL_285]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_286]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @floor_dynamic( -// CHECK-SAME: [[VAL_287:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_288:%.*]] = "tf.Floor"([[VAL_287]]) : (tensor) -> tensor -// CHECK: return [[VAL_288]] : tensor -// CHECK: } - -// CHECK-LABEL: func @floor_unranked( -// CHECK-SAME: [[VAL_289:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_290:%.*]] = "tf.Floor"([[VAL_289]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_290]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @is_finite( -// CHECK-SAME: [[VAL_291:%.*]]: tensor<2xf32>) -> tensor<2xi1> { -// CHECK: [[VAL_292:%.*]] = "tf.IsFinite"([[VAL_291]]) : (tensor<2xf32>) -> tensor<2xi1> -// CHECK: return [[VAL_292]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @is_finite_dynamic( -// CHECK-SAME: [[VAL_293:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_294:%.*]] = "tf.IsFinite"([[VAL_293]]) : (tensor) -> tensor -// CHECK: return [[VAL_294]] : tensor -// CHECK: } - -// CHECK-LABEL: func @is_finite_unranked( -// CHECK-SAME: [[VAL_295:%.*]]: tensor<*xf32>) -> tensor<*xi1> { -// CHECK: [[VAL_296:%.*]] = "tf.IsFinite"([[VAL_295]]) : (tensor<*xf32>) -> tensor<*xi1> -// CHECK: 
return [[VAL_296]] : tensor<*xi1> -// CHECK: } - -// CHECK-LABEL: func @log( -// CHECK-SAME: [[VAL_297:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_298:%.*]] = "tf.Log"([[VAL_297]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_298]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @log_dynamic( -// CHECK-SAME: [[VAL_299:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_300:%.*]] = "tf.Log"([[VAL_299]]) : (tensor) -> tensor -// CHECK: return [[VAL_300]] : tensor -// CHECK: } - -// CHECK-LABEL: func @log_unranked( -// CHECK-SAME: [[VAL_301:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_302:%.*]] = "tf.Log"([[VAL_301]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_302]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @log1p( -// CHECK-SAME: [[VAL_303:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_304:%.*]] = "tf.Log1p"([[VAL_303]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_304]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @log1p_dynamic( -// CHECK-SAME: [[VAL_305:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_306:%.*]] = "tf.Log1p"([[VAL_305]]) : (tensor) -> tensor -// CHECK: return [[VAL_306]] : tensor -// CHECK: } - -// CHECK-LABEL: func @log1p_unranked( -// CHECK-SAME: [[VAL_307:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_308:%.*]] = "tf.Log1p"([[VAL_307]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_308]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @neg( -// CHECK-SAME: [[VAL_309:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_310:%.*]] = "tf.Neg"([[VAL_309]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_310]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @neg_dynamic( -// CHECK-SAME: [[VAL_311:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_312:%.*]] = "tf.Neg"([[VAL_311]]) : (tensor) -> tensor -// CHECK: return [[VAL_312]] : tensor -// CHECK: } - -// CHECK-LABEL: func @neg_unranked( -// CHECK-SAME: [[VAL_313:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_314:%.*]] = "tf.Neg"([[VAL_313]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_314]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @sigmoid( -// CHECK-SAME: [[VAL_315:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_316:%.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor -// CHECK: [[VAL_317:%.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_318:%.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: [[VAL_319:%.*]] = "tf.Mul"([[VAL_315]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_320:%.*]] = "tf.Tanh"([[VAL_319]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_321:%.*]] = "tf.Mul"([[VAL_320]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_322:%.*]] = "tf.AddV2"([[VAL_321]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_322]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sin( -// CHECK-SAME: [[VAL_323:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_324:%.*]] = "tf.Sin"([[VAL_323]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_324]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sin_dynamic( -// CHECK-SAME: [[VAL_325:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_326:%.*]] = "tf.Sin"([[VAL_325]]) : (tensor) -> tensor -// CHECK: return [[VAL_326]] : tensor -// CHECK: } - -// CHECK-LABEL: 
func @sin_unranked( -// CHECK-SAME: [[VAL_327:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_328:%.*]] = "tf.Sin"([[VAL_327]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_328]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @rsqrt( -// CHECK-SAME: [[VAL_329:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_330:%.*]] = "tf.Rsqrt"([[VAL_329]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_330]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @rsqrt_dynamic( -// CHECK-SAME: [[VAL_331:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_332:%.*]] = "tf.Rsqrt"([[VAL_331]]) : (tensor) -> tensor -// CHECK: return [[VAL_332]] : tensor -// CHECK: } - -// CHECK-LABEL: func @rsqrt_unranked( -// CHECK-SAME: [[VAL_333:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_334:%.*]] = "tf.Rsqrt"([[VAL_333]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_334]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @sqrt( -// CHECK-SAME: [[VAL_335:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_336:%.*]] = "tf.Sqrt"([[VAL_335]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_336]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sqrt_dynamic( -// CHECK-SAME: [[VAL_337:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_338:%.*]] = "tf.Sqrt"([[VAL_337]]) : (tensor) -> tensor -// CHECK: return [[VAL_338]] : tensor -// CHECK: } - -// CHECK-LABEL: func @sqrt_unranked( -// CHECK-SAME: [[VAL_339:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_340:%.*]] = "tf.Sqrt"([[VAL_339]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_340]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @tanh( -// CHECK-SAME: [[VAL_341:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_342:%.*]] = "tf.Tanh"([[VAL_341]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_342]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @tanh_dynamic( -// CHECK-SAME: [[VAL_343:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_344:%.*]] = "tf.Tanh"([[VAL_343]]) : (tensor) -> tensor -// CHECK: return [[VAL_344]] : tensor -// CHECK: } - -// CHECK-LABEL: func @tanh_unranked( -// CHECK-SAME: [[VAL_345:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_346:%.*]] = "tf.Tanh"([[VAL_345]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_346]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast( -// CHECK-SAME: [[VAL_347:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_348:%.*]] = "tf.Bitcast"([[VAL_347]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_348]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast_dynamic( -// CHECK-SAME: [[VAL_349:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_350:%.*]] = "tf.Bitcast"([[VAL_349]]) : (tensor) -> tensor -// CHECK: return [[VAL_350]] : tensor -// CHECK: } - -// CHECK-LABEL: func @bitcast_unranked( -// CHECK-SAME: [[VAL_351:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_352:%.*]] = "tf.Bitcast"([[VAL_351]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_352]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast_same_widths( -// CHECK-SAME: [[VAL_353:%.*]]: tensor<2xf32>) -> tensor<2xi32> { -// CHECK: [[VAL_354:%.*]] = "tf.Bitcast"([[VAL_353]]) : (tensor<2xf32>) -> tensor<2xi32> -// CHECK: return [[VAL_354]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @sign( -// CHECK-SAME: [[VAL_355:%.*]]: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { -// CHECK: [[VAL_356:%.*]] = "tf.NotEqual"([[VAL_355]], [[VAL_355]]) 
{incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: [[VAL_357:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_358:%.*]] = "tf.NotEqual"([[VAL_355]], [[VAL_355]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: [[VAL_359:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_360:%.*]] = "tf.Sign"([[VAL_355]]) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_361:%.*]] = "tf.Select"([[VAL_358]], [[VAL_359]], [[VAL_360]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_362:%.*]] = "tf.Select"([[VAL_356]], [[VAL_357]], [[VAL_361]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: return [[VAL_362]] : tensor<1x2x3x4xf32> -// CHECK: } - -// CHECK-LABEL: func @size_rank_one_i32( -// CHECK-SAME: [[VAL_363:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_364:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: return [[VAL_364]] : tensor -// CHECK: } - -// CHECK-LABEL: func @size_rank_one_i64( -// CHECK-SAME: [[VAL_365:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_366:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: return [[VAL_366]] : tensor -// CHECK: } - -// CHECK-LABEL: func @complex( -// CHECK-SAME: [[VAL_367:%.*]]: tensor<3xf32>, [[VAL_368:%.*]]: tensor<3xf32>) -> tensor<3xcomplex> { -// CHECK: [[VAL_369:%.*]] = "tf.Complex"([[VAL_367]], [[VAL_368]]) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex> -// CHECK: return [[VAL_369]] : tensor<3xcomplex> -// CHECK: } - -// CHECK-LABEL: func @convert_i32_f32( -// CHECK-SAME: [[VAL_370:%.*]]: tensor<2xi32>) -> tensor<2xf32> { -// CHECK: [[VAL_371:%.*]] = "tf.Cast"([[VAL_370]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> -// CHECK: return [[VAL_371]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_slice( -// CHECK-SAME: [[VAL_372:%.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { -// CHECK: [[VAL_373:%.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_374:%.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_375:%.*]] = "tf.Slice"([[VAL_372]], [[VAL_373]], [[VAL_374]]) : (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> -// CHECK: return [[VAL_375]] : tensor<1x519xf32> -// CHECK: } - -// CHECK-LABEL: func @reshape( -// CHECK-SAME: [[VAL_372:%.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { -// CHECK: [[VAL_373:%.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> -// CHECK: [[VAL_374:%.*]] = "tf.Reshape"([[VAL_372]], [[VAL_373]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> -// CHECK: return [[VAL_374]] : tensor<2x2x6xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_1d_2d( -// CHECK-SAME: [[VAL_376:%.*]]: tensor<256xf32>, [[VAL_377:%.*]]: tensor<256x1xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_378:%.*]] = "tf.Reshape"([[VAL_376]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_379:%.*]] = "tf.MatMul"([[VAL_378]], [[VAL_377]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_380:%.*]] = "tf.Reshape"([[VAL_379]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: 
return [[VAL_380]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_2d_1d( -// CHECK-SAME: [[VAL_381:%.*]]: tensor<1x256xf32>, [[VAL_382:%.*]]: tensor<256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_383:%.*]] = "tf.Reshape"([[VAL_382]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_384:%.*]] = "tf.MatMul"([[VAL_381]], [[VAL_383]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_385:%.*]] = "tf.Reshape"([[VAL_384]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_385]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_1d_1d( -// CHECK-SAME: [[VAL_386:%.*]]: tensor<256xf32>, [[VAL_387:%.*]]: tensor<256xf32>) -> tensor { -// CHECK-DAG: [[VAL_388:%.*]] = "tf.Reshape"([[VAL_386]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK-DAG: [[VAL_389:%.*]] = "tf.Reshape"([[VAL_387]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_390:%.*]] = "tf.MatMul"([[VAL_388]], [[VAL_389]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_391:%.*]] = "tf.Reshape"([[VAL_390]], {{.*}}) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor -// CHECK: return [[VAL_391]] : tensor -// CHECK: } - -// CHECK-LABEL: func @convert_dot_2d_2d( -// CHECK-SAME: [[VAL_392:%.*]]: tensor<1x256xf32>, [[VAL_393:%.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { -// CHECK: [[VAL_394:%.*]] = "tf.MatMul"([[VAL_392]], [[VAL_393]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> -// CHECK: return [[VAL_394]] : tensor<1x1xf32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_in_dim_tf_style( -// CHECK-SAME: [[VAL_395:%.*]]: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { -// CHECK: [[VAL_396:%.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> -// CHECK: [[VAL_397:%.*]] = "tf.BroadcastTo"([[VAL_395]], [[VAL_396]]) : (tensor<8x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> -// CHECK: return [[VAL_397]] : tensor<3x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_in_dim_general_case( -// CHECK-SAME: [[VAL_398:%.*]]: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { -// CHECK: [[VAL_399:%.*]] = constant dense<[3, 1, 1, 16]> : tensor<4xi64> -// CHECK: [[VAL_400:%.*]] = "tf.Reshape"([[VAL_398]], [[VAL_399]]) : (tensor<3x1x16xf32>, tensor<4xi64>) -> tensor<3x1x1x16xf32> -// CHECK: [[VAL_401:%.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> -// CHECK: [[VAL_402:%.*]] = "tf.BroadcastTo"([[VAL_400]], [[VAL_401]]) : (tensor<3x1x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> -// CHECK: return [[VAL_402]] : tensor<3x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_general( -// CHECK-SAME: [[VAL_396:%.*]]: tensor<3x2x6x5x1xf32>, [[VAL_397:%.*]]: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { -// CHECK: [[VAL_398:%.*]] = "tf.Transpose"([[VAL_396]], {{.*}}) : (tensor<3x2x6x5x1xf32>, tensor<5xi64>) -> tensor<3x5x1x2x6xf32> -// CHECK: [[VAL_399:%.*]] = "tf.Transpose"([[VAL_397]], {{.*}}) : (tensor<3x2x4x6xf32>, tensor<4xi64>) -> tensor<3x2x6x4xf32> -// CHECK: [[VAL_400:%.*]] = "tf.Reshape"([[VAL_398]], {{.*}}) : (tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> -// CHECK: [[VAL_401:%.*]] = "tf.Reshape"([[VAL_399]], {{.*}}) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> -// CHECK: [[VAL_402:%.*]] = "tf.BatchMatMulV2"([[VAL_400]], [[VAL_401]]) {adj_x = false, adj_y = 
false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> -// CHECK: [[VAL_403:%.*]] = "tf.Reshape"([[VAL_402]], {{.*}}) : (tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> -// CHECK: return [[VAL_403]] : tensor<3x5x1x4xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_conv2d( -// CHECK-SAME: [[VAL_404:%.*]]: tensor<1x8x8x207xf32>, [[VAL_405:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_406:%.*]] = "tf.Conv2D"([[VAL_404]], [[VAL_405]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_406]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_depthwise_conv2d( -// CHECK-SAME: [[VAL_407:%.*]]: tensor<1x8x8x207xf32>, [[VAL_408:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_409:%.*]] = "tf.DepthwiseConv2dNative"([[VAL_407]], [[VAL_408]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_409]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_conv2d_valid_padding( -// CHECK-SAME: [[VAL_410:%.*]]: tensor<1x8x8x207xf32>, [[VAL_411:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_412:%.*]] = "tf.Conv2D"([[VAL_410]], [[VAL_411]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_412]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_sum( -// CHECK-SAME: [[VAL_413:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_414:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_415:%.*]] = "tf.Sum"([[VAL_413:%.*]], [[VAL_414:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_415]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_max( -// CHECK-SAME: [[VAL_416:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_417:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_418:%.*]] = "tf.Max"([[VAL_416:%.*]], [[VAL_417:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_418]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_min( -// CHECK-SAME: [[VAL_419:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_420:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_421:%.*]] = "tf.Min"([[VAL_419:%.*]], [[VAL_420:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_421]] : tensor<1xf32> -// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index e7e07845fcc..155f84ecc37 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -215,6 +215,60 @@ func @rsqrt_grad_unranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor< return %0 : tensor<*xf32> } +// %input has 1 batch dimension then 2 block dimensions 
then 1 remainder
+// dimension.
+// CHECK-LABEL: fourdim_SpaceToBatchND
+func @fourdim_SpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x2xi64>) -> tensor<*xf32> {
+  // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() {value = dense<0> : tensor<1x2xi64>}
+  // CHECK-DAG: [[ZERO_I32:%.+]] = "tf.Const"() {value = dense<0> : tensor<i32>}
+  // CHECK-DAG: [[ZERO_I64:%.+]] = "tf.Const"() {value = dense<0> : tensor<i64>}
+  // CHECK-DAG: [[ONE_I64:%.+]] = "tf.Const"() {value = dense<1> : tensor<i64>}
+  // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[ZERO_I64]])
+  // CHECK-DAG: [[PAD_DEFAULT:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>}
+  // CHECK-DAG: [[PADDED:%.+]] = "tf.PadV2"(%arg0, [[FULL_PADDINGS]], [[PAD_DEFAULT]])
+  // CHECK-DAG: [[PADDINGS_SUM:%.+]] = "tf.Sum"([[FULL_PADDINGS]], [[ONE_I64]])
+  // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 10]> : tensor<4xi64>}
+  // CHECK-DAG: [[PADDED_SHAPE:%.+]] = "tf.Add"([[PADDINGS_SUM]], [[INPUT_SHAPE]])
+  // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:4 = "tf.Split"([[ZERO_I32]], [[PADDED_SHAPE]])
+  // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:2 = "tf.Split"([[ZERO_I32]], %arg1)
+  // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, [[BLOCK_SHAPE_SPLITS]]#0)
+  // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1)
+  // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]])
+  // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi64>}
+  // CHECK-DAG: [[OUTPUT_BATCH_PART:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0)
+  // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART]], [[BLOCK_SHAPE_SPLITS]]#1)
+  // CHECK-DAG: [[OUTPUT_SHAPE:%.+]] = "tf.ConcatV2"([[OUTPUT_BATCH]], [[OUTER_SHAPE_0]], [[OUTER_SHAPE_1]], [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]])
+  // CHECK-DAG: [[RESHAPED:%.+]] = "tf.Reshape"([[PADDED]], [[RESHAPED_SHAPE]])
+  // CHECK-DAG: [[PERMUTED:%.+]] = "tf.Transpose"([[RESHAPED]], [[PERMUTATION]])
+  // CHECK-DAG: [[RESULT:%.+]] = "tf.Reshape"([[PERMUTED]], [[OUTPUT_SHAPE]])
+  // CHECK-DAG: return [[RESULT]]
+  %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
+
+// %input has 1 batch dimension then 3 block dimensions then 2 remainder
+// dimensions. This checks only ops that are specific to the case with 3 block
+// dimension and 2 remainder dimensions.
+// CHECK-LABEL: sixdim_SpaceToBatchND +func @sixdim_SpaceToBatchND(%input: tensor<3x5x7x9x10x11xf32>, %block_shape: tensor<3xi64>, %paddings: tensor<3x2xi64>) -> tensor<*xf32> { + // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() + // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[PAD00]], {{.+}}) + // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 9, 10, 11]> : tensor<6xi64>} + // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:6 = "tf.Split" + // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:3 = "tf.Split" + // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[OUTER_SHAPE_2:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#3, [[BLOCK_SHAPE_SPLITS]]#2) + // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[OUTER_SHAPE_2]], [[BLOCK_SHAPE_SPLITS]]#2, [[PADDED_SHAPE_SPLITS]]#4, [[PADDED_SHAPE_SPLITS]]#5, {{.+}}) + // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 6, 0, 1, 3, 5, 7, 8]> : tensor<9xi64>} + // CHECK-DAG: [[OUTPUT_BATCH_PART1:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTPUT_BATCH_PART2:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART1]], [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART2]], [[BLOCK_SHAPE_SPLITS]]#2) + // CHECK-DAG: [[OUTPUT_SHAPE:%.+]] = "tf.ConcatV2"([[OUTPUT_BATCH]], [[OUTER_SHAPE_0]], [[OUTER_SHAPE_1]], [[OUTER_SHAPE_2]], [[PADDED_SHAPE_SPLITS]]#4, [[PADDED_SHAPE_SPLITS]]#5, {{.+}}) + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x9x10x11xf32>, tensor<3xi64>, tensor<3x2xi64>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: SoftmaxCrossEntropyWithLogits // CHECK-SAME: %[[FEATURES:.*]]: tensor<2x3xf32>, %[[LABELS:.*]]: tensor<2x3xf32> func @SoftmaxCrossEntropyWithLogits(%features: tensor<2x3xf32>, %labels: tensor<2x3xf32>) -> (tensor<2xf32>, tensor<2x3xf32>) { @@ -353,8 +407,16 @@ func @ZerosLike_variant(%arg0: tensor>>) -> tensor>> } -// CHECK-LABEL: func @addN -func @addN(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { +// CHECK-LABEL: func @addN_2 +func @addN_2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // return %[[SUM0]] + %0 = "tf.AddN"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addN_3 +func @addN_3(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%[[SUM0]], %arg2) // return %[[SUM1]] @@ -362,6 +424,27 @@ func @addN(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @addN_4 +func @addN_4(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>, %arg3: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%arg2, %arg3) + // CHECK: %[[SUM2:.*]] = "tf.AddV2"(%[[SUM0]], %[[SUM1]]) + // return %[[SUM2]] + %0 = "tf.AddN"(%arg0, %arg1, %arg2, %arg3) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// 
CHECK-LABEL: func @addN_5 +func @addN_5(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>, %arg3: tensor<*xf32>, %arg4: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%arg2, %arg3) + // CHECK: %[[SUM2:.*]] = "tf.AddV2"(%[[SUM0]], %[[SUM1]]) + // CHECK: %[[SUM3:.*]] = "tf.AddV2"(%[[SUM2]], %arg4) + // return %[[SUM3]] + %0 = "tf.AddN"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: func @addN_variant func @addN_variant(%arg0: tensor>>, %arg1: tensor>>, %arg2: tensor>>) -> tensor>> { // CHECK: tf.AddN @@ -450,13 +533,39 @@ func @DynamicStitch_duplicates(%arg0: tensor<2x2xf32>) -> tensor<1x2xf32> { return %0 : tensor<1x2xf32> } -func @Reciprocal(%arg0: tensor<*xf32>) -> tensor<*xf32> { +// CHECK-LABEL: @Reciprocal_i32 +func @Reciprocal_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> +} + +// CHECK-LABEL: @Reciprocal_f32 +func @Reciprocal_f32(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Reciprocal"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: @Reciprocal_complexf32 +func @Reciprocal_complexf32(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xcomplex>) -> tensor<*xcomplex> + return %0 : tensor<*xcomplex> +} + +// CHECK-LABEL: @Reciprocal_complexf64 +func @Reciprocal_complexf64(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xcomplex>) -> tensor<*xcomplex> + return %0 : tensor<*xcomplex> +} + +// CHECK-LABEL: @ScatterNd func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} : () -> tensor<8xf32> // CHECK: "tf.TensorScatterUpdate"(%[[ZERO]], %arg0, %arg1) : (tensor<8xf32>, tensor<4x1xi32>, tensor<4xf32>) -> tensor<8xf32> @@ -465,3 +574,16 @@ func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { %0 = "tf.ScatterNd"(%arg0, %arg1, %shape) : (tensor<4x1xi32>, tensor<4xf32>, tensor<1xi32>) -> tensor<8xf32> return %0 : tensor<8xf32> } + +// CHECK-LABEL: @_UnaryOpsComposition +// CHECK-SAME: %[[ARG0:.*]]: tensor<4xf32> +func @_UnaryOpsComposition(%arg0: tensor<4xf32>) -> tensor<4xf32> { + + // CHECK: %[[RESULT0:.*]] = "tf.Asin"(%[[ARG0]]) + // CHECK: %[[RESULT1:.*]] = "tf.Abs"(%[[RESULT0]]) + // CHECK: %[[RESULT2:.*]] = "tf.Log"(%[[RESULT1]]) + // CHECK: return %[[RESULT2]] + + %0 = "tf._UnaryOpsComposition"(%arg0) {op_names = ["Asin", "Abs", "Log"]} : (tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir index 3efa0b09439..dc99d9d6343 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -1,53 +1,286 @@ // RUN: tf-opt %s -tf-mark-ops-for-outside-compilation | FILECHECK_OPTS="" FileCheck %s - -// CHECK-LABEL: func @op_string_result -func @op_string_result() -> tensor { +// CHECK-LABEL: func @unsupported_op_no_soft_placement +func @unsupported_op_no_soft_placement() -> tensor { %0 = "tf_device.cluster"() ( { - // CHECK: "tf.A" + // CHECK: "tf.UnsupportedOp" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.B" - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" + // CHECK: "tf.Identity" // CHECK-NOT: _xla_outside_compilation - %1 = "tf.A"() : () -> tensor - %2 = "tf.B"(%1) : (tensor) -> tensor - %3 = "tf.C"(%1) : (tensor) -> tensor - tf_device.return %3 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - return %0 : tensor + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %2 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor } -// CHECK-LABEL: func @op_string_operand -func @op_string_operand(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @unsupported_op_soft_placement_false +func @unsupported_op_soft_placement_false() -> tensor { %0 = "tf_device.cluster"() ( { - // CHECK: "tf.A" + // CHECK: "tf.UnsupportedOp" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.B" + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %2 : tensor + }) {allow_soft_placement = false, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @unsupported_op +func @unsupported_op() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.UnsupportedOp" // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" + // CHECK: "tf.Identity" // CHECK-NOT: _xla_outside_compilation - %1 = "tf.A"() : () -> tensor - %2 = "tf.B"(%arg0) : (tensor) -> tensor - %3 = "tf.C"(%2) : (tensor) -> tensor - tf_device.return %3 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - return %0 : tensor + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %2 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @tf2xla_fallback_op +func @tf2xla_fallback_op() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.UnsupportedOp" + // CHECK-SAME: _xla_outside_compilation + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Sinh" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + %4 = "tf.Sinh"(%2) : (tensor) -> tensor + tf_device.return %4 : tensor + }) 
{allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ignore_embedding_ops +func @ignore_embedding_ops() -> () { + "tf_device.cluster"() ( { + // CHECK: "tf.RecvTPUEmbeddingActivations" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.SendTPUEmbeddingGradients" + // CHECK-NOT: _xla_outside_compilation + %2:2 = "tf.RecvTPUEmbeddingActivations"() {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D"} : () -> (tensor<2x2xf32>, tensor<4x4xf32>) + "tf.SendTPUEmbeddingGradients"(%2#0, %2#1) {_tpu_embedding_layer = "call1", config = "\0A\0B\0C\0D", operand_segment_sizes = dense<[2, 0]> : vector<2xi32>} : (tensor<2x2xf32>, tensor<4x4xf32>) -> () + tf_device.return + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () + return +} + +// CHECK-LABEL: func @op_string_result +func @op_string_result() -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Const" + // CHECK-SAME: _xla_outside_compilation + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<"x"> : tensor} : () -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %3 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} +// CHECK-LABEL: func @op_string_operand +func @op_string_operand(%arg0: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.StringToNumber"(%arg0) {out_type = f32} : (tensor) -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %3 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor } // CHECK-LABEL: func @op_string_operand_string_result -func @op_string_operand_string_result(%arg0: tensor) -> tensor { +func @op_string_operand_string_result(%arg0: tensor) -> tensor { %0 = "tf_device.cluster"() ( { - // CHECK: "tf.A" + // CHECK: "tf.Const"() {value = dense<1> : tensor} // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.B" + // CHECK: "tf.Identity" // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" + // CHECK-SAME: tf.string + // CHECK: "tf.Identity" // CHECK-NOT: _xla_outside_compilation - %1 = "tf.A"() : () -> tensor - %2 = "tf.B"(%arg0) : (tensor) -> tensor - %3 = "tf.C"(%1) : (tensor) -> tensor - tf_device.return %3 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - return %0 : tensor + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Identity"(%arg0) : (tensor) -> tensor + %3 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %3 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that a tf.IfRegion op with a captured string operand is marked for outside 
compilation. + +// CHECK-LABEL: func @if_region_captured_string +func @if_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK: "tf.StringToNumber" + // CHECK-NOT: _xla_outside_compilation + // CHECK: _xla_outside_compilation = "auto1", is_stateless = true + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.StringToNumber"(%arg1) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + %4 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%4) : (tensor) -> () + }) {is_stateless = true} : (tensor) -> (tensor) + %5 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that ops with string results/operands inside a tf.IfRegion branch are marked for outside compilation. + +// CHECK-LABEL: func @if_region_string_op +func @if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"1.0"> : tensor} + // CHECK-NEXT: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + %4 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor + %5 = "tf.StringToNumber"(%4) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%5) : (tensor) -> () + // CHECK: {is_stateless + }) {is_stateless = true} : (tensor) -> (tensor) + %6 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %6: tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that ops with string results/operands inside a nested tf.IfRegion branch are marked for outside compilation. 
+ +// CHECK-LABEL: func @nested_if_region_string_op +func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK-NOT: _xla_outside_compilation + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.IfRegion"(%arg0) ( { + %3 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%3) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {value = dense : tensor} + // CHECK-NOT: _xla_outside_compilation + %4 = "tf.Const"() {value = dense : tensor} : () -> tensor + %5 = "tf.IfRegion"(%4)({ + // CHECK: "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"1.0"> : tensor} + // CHECK-NEXT: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation + %6 = "tf.Const"() {value = dense<"1.0"> : tensor} : () -> tensor + %7 = "tf.StringToNumber"(%6) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + %8 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + "tf.Yield"(%8) : (tensor) -> () + // CHECK: {is_stateless + }){is_stateless = true} : (tensor) -> (tensor) + "tf.Yield"(%5) : (tensor) -> () + // CHECK: {is_stateless + }) {is_stateless = true} : (tensor) -> (tensor) + %9 = "tf.Identity"(%2) : (tensor) -> tensor + tf_device.return %9: tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that a tf.WhileRegion op with a captured string operand is marked for outside compilation. + +// CHECK-LABEL: func @while_region_captured_string +func @while_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.WhileRegion" + // CHECK: "tf.StringToNumber" + // CHECK: _xla_outside_compilation = "auto1", is_stateless = true + %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %2:2 = "tf.WhileRegion"(%1, %arg0) ( { + ^bb0(%carg0: tensor, %carg1: tensor): + %limit = constant dense<5> : tensor + %cond = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: tensor): + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + %3 = "tf.StringToNumber"(%arg1) {out_type = f32} : (tensor) -> tensor + "tf.Yield"(%3, %sub) : (tensor, tensor) -> () + }) {is_stateless = true} : (tensor, tensor) -> (tensor, tensor) + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %5 = "tf.Identity"(%2#0) : (tensor) -> (tensor) + tf_device.return %5 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor +} + +// Test that an unsupported op within a tf.WhileRegion is marked for outside compilation. 
+ +// CHECK-LABEL: func @while_region_unsupported_op +func @while_region_unsupported_op(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf_device.cluster"() ( { + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.WhileRegion" + %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %2:2 = "tf.WhileRegion"(%1, %arg0) ( { + ^bb0(%carg0: tensor, %carg1: tensor): + %limit = constant dense<5> : tensor + %cond = "tf.NotEqual"(%carg1, %limit) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: tensor): + %one = constant dense<1> : tensor + %sub = "tf.Sub"(%barg1, %one) : (tensor, tensor) -> tensor + // CHECK: "tf.UnsupportedOp" + // CHECK-SAME: _xla_outside_compilation + %3 = "tf.UnsupportedOp"() {value = dense<1> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + %4 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + "tf.Yield"(%4, %sub) : (tensor, tensor) -> () + // CHECK: {is_stateless = true + }) {is_stateless = true} : (tensor, tensor) -> (tensor, tensor) + // CHECK: "tf.Identity" + // CHECK-NOT: _xla_outside_compilation + %5 = "tf.Identity"(%2#0) : (tensor) -> (tensor) + tf_device.return %5 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir new file mode 100644 index 00000000000..2f2ee6f1286 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/case.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 486 : i32}} { + func @main() { + tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %outputs_2, %control_3 = tf_executor.island wraps "tf.Case"(%outputs_0, %outputs) {Tin = [i32], Tout = [i32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], branches = [@indexed_case_branch0_40, @indexed_case_branch1_50], device = "", is_stateless = true, output_shapes = [#tf.shape<>]} : (tensor, tensor) -> tensor<*xi32> loc("stateless_case") + %outputs_4, %control_5 = tf_executor.island wraps "tf.Identity"(%outputs_2) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_6, %control_7 = tf_executor.island wraps "tf.Case"(%outputs_0, %outputs) {Tin = [i32], Tout = [i32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], branches = [@indexed_case_branch0_40, @indexed_case_branch1_50], device = "", is_stateless = false, output_shapes = [#tf.shape<>]} : (tensor, tensor) -> tensor<*xi32> loc("regular_case") + tf_executor.fetch + } + return + } + + func @indexed_case_branch0_40(%arg0: tensor) -> tensor<*xi32> attributes {sym_visibility = "private"} { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.AddV2"(%arg0, %outputs) {device = ""} : (tensor, tensor) -> tensor<*xi32> + tf_executor.fetch %outputs_0 : tensor<*xi32> + } + return %0 : tensor<*xi32> + } 
+ + func @indexed_case_branch1_50(%arg0: tensor) -> tensor<*xi32> attributes {sym_visibility = "private"} { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", value = dense<2> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.AddV2"(%arg0, %outputs) {device = ""} : (tensor, tensor) -> tensor<*xi32> + tf_executor.fetch %outputs_0 : tensor<*xi32> + } + return %0 : tensor<*xi32> + } +} + +// CHECK: name: "stateless_case" +// CHECK-NEXT: "StatelessCase" +// CHECK: name: "regular_case" +// CHECK-NEXT: "Case" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir index c6543f3121e..09a38b5b5de 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir @@ -43,7 +43,7 @@ func @main() { // CHECK-NEXT: } // CHECK-NEXT: } // CHECK: } - %1:2 = tf_executor.island wraps "tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo, @bar], device = "", output_shapes = []} : (tensor) -> tensor<*xf32> loc("Case") + %1:2 = tf_executor.island wraps "tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo, @bar], device = "", output_shapes = [], is_stateless = false} : (tensor) -> tensor<*xf32> loc("Case") tf_executor.fetch } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir index e9d4e441a10..3e8935b699e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -212,6 +212,28 @@ func @testNoOutputs(%arg0: tensor, %arg1: tensor<*xf32>) -> () { return } +// ----- +// Check ToBool folding for IfRegion +// CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Abs" +// CHECK-LABEL: @testToBoolFold +func @testToBoolFold(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NEXT: "tf.If"(%arg0, %arg1) + // CHECK-SAME: else_branch = @tf.IfRegion_else + // CHECK-SAME: then_branch = @tf.IfRegion_then + %tobool = "tf.ToBool"(%arg0) : (tensor) -> tensor + %0 = "tf.IfRegion"(%tobool) ({ + %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }) {is_stateless = true} : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // ----- // Simple WhileRegion @@ -592,3 +614,64 @@ func @testWhileRegionBlockArgMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor // CHECK: return [[Result]]#0 return %0#0 : tensor<*xf32> } + +// ----- + +// Simple trivially transformable while with ToBool +// CHECK: func @while_cond +// CHECK: func @while_body +// CHECK-LABEL: testWhileRegionTrivial +func @while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor +func @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) +func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + 
^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond_i32 = call @while_cond(%carg0, %carg1) : (tensor<*xf32>, tensor) -> tensor + %cond = "tf.ToBool"(%cond_i32) : (tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy:2 = call @while_body(%barg0, %barg1) : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Test tf.IfRegion device is preserved. +// CHECK-LABEL: func @testIfRegionDevice +func @testIfRegionDevice(%arg0: tensor) { + "tf.IfRegion"(%arg0) ({ + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false, device = "/device:CPU:0"} : (tensor) -> () + + // CHECK: "tf.If" + // CHECK-SAME: device = "/device:CPU:0" + return +} + +// ----- + +// Test tf.WhileRegion device is preserved. +// CHECK-LABEL: func @testWhileRegionDevice +func @testWhileRegionDevice() { + "tf.WhileRegion"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%0) : (tensor) -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false, device = "/device:CPU:0"} : () -> () + + // CHECK: "tf.While" + // CHECK-SAME: device = "/device:CPU:0" + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index 9931a45f995..487234ce958 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tf-replicate-to-island | FileCheck %s +// RUN: tf-opt -split-input-file %s -tf-replicate-to-island | FileCheck %s // Tests per replica island has same control operands as island holding // replicate. @@ -223,3 +223,219 @@ func @replica_id_attr_added(%arg0: tensor, %arg1: tensor // CHECK: "tf.A" // CHECK-NOT: _xla_replica_id // CHECK: tf_executor.fetch + + +// Tests device ordinals are added to `tf._XlaSendFromHost`/`tf._XlaRecvAtHost` +// based on the first TPU core device id. +// CHECK-LABEL: func @device_ordinals +func @device_ordinals(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + %0 = "tf._XlaRecvAtHost"(%arg1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf.NoOp" +// CHECK: tf_executor.island +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf.NoOp" + +// ----- + +// Tests functions with replica variant ops reachable from a replicate region +// is cloned and remapped. 
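Put differently, a function reachable from the replicate body that contains replica-variant ops (here `tf._XlaRecvAtHost` / `tf._XlaSendFromHost` with a `device_ordinal`) cannot stay shared across replicas, so the pass is expected to emit one clone per replica and rewrite the ordinal from the `TPU_REPLICATED_CORE_0` device list. A rough sketch of the intended result for the two-replica case below, with the clone name and element types purely illustrative (the CHECK captures only pin the pattern):

// Replica 0 is assigned /device:TPU:1, so its clone uses device_ordinal = 1.
func @send_recv_replica_0(%arg0: tensor<2x!tf.string>) {
  %0 = "tf._XlaRecvAtHost"(%arg0) {_xla_has_host_transfer = true, device_ordinal = 1 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor<f32>
  "tf._XlaSendFromHost"(%0, %arg0) {_xla_has_host_transfer = true, device_ordinal = 1 : i64, key = "host_compute_channel_recv_0"} : (tensor<f32>, tensor<2x!tf.string>) -> ()
  "tf.NoOp"() : () -> ()
  return
}
// The second clone is identical except for device_ordinal = 2, since replica 1 runs on /device:TPU:2.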
+ +// CHECK-LABEL: func @call_with_replicate_variant_ops +func @call_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALL_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALL_REPLICA_1:@[a-z0-9_]+]] + +func @send_recv(%arg0: tensor<2x!tf.string>) { + %0 = "tf._XlaRecvAtHost"(%arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + return +} + +// CHECK: func [[CALL_REPLICA_0]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[CALL_REPLICA_1]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests transitive functions with replica variant ops reachable from a +// replicate region is cloned and remapped. + +// CHECK-LABEL: func @call_with_replicate_variant_ops +func @call_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALLEE_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[CALLEE_REPLICA_1:@[a-z0-9_]+]] + +func @callee(%arg0: tensor<2x!tf.string>) { + "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + return +} + +func @send_recv(%arg0: tensor<2x!tf.string>) { + %0 = "tf._XlaRecvAtHost"(%arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + "tf._XlaSendFromHost"(%0, %arg0) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + "tf.NoOp"() : () -> () + return +} + +// CHECK: func [[CALLEE_REPLICA_0]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = [[TRANSITIVE_CALLEE_REPLICA_0:@[a-z0-9_]+]] + +// CHECK: func [[TRANSITIVE_CALLEE_REPLICA_0]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[CALLEE_REPLICA_1]] +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = 
[[TRANSITIVE_CALLEE_REPLICA_1:@[a-z0-9_]+]] + +// CHECK: func [[TRANSITIVE_CALLEE_REPLICA_1]] +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests functional control flow functions with replica variant ops reachable +// from a replicate region is cloned and remapped. Only the branches reachable +// with replica variant ops are cloned. + +// CHECK-LABEL: func @control_flow_with_replicate_variant_ops +func @control_flow_with_replicate_variant_ops(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg4: tensor, [%arg1, %arg1] as %arg5: tensor, [%arg2, %arg2] as %arg6: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + %0 = "tf.If"(%arg4, %arg5, %arg6, %arg3) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor, tensor, tensor<2x!tf.string>) -> tensor + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.If" +// CHECK-SAME: else_branch = @cond_false +// CHECK-SAME: then_branch = [[COND_TRUE_REPLICA_0:@[a-z0-9_]+]] +// CHECK: "tf.If" +// CHECK-SAME: else_branch = @cond_false +// CHECK-SAME: then_branch = [[COND_TRUE_REPLICA_1:@[a-z0-9_]+]] + +func @cond_false(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x!tf.string>) -> tensor { + return %arg0 : tensor +} + +// CHECK-NOT: func @cond_false.+( + +func @cond_true(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x!tf.string>) -> tensor { + "tf._XlaSendFromHost"(%arg1, %arg2) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_recv_0"} : (tensor, tensor<2x!tf.string>) -> () + %0 = "tf._XlaRecvAtHost"(%arg2) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_send_0"} : (tensor<2x!tf.string>) -> tensor + return %0 : tensor +} + +// CHECK: func [[COND_TRUE_REPLICA_0]] +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 1 +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 1 + +// CHECK: func [[COND_TRUE_REPLICA_1]] +// CHECK: "tf._XlaSendFromHost" +// CHECK-SAME: device_ordinal = 2 +// CHECK: "tf._XlaRecvAtHost" +// CHECK-SAME: device_ordinal = 2 + +// ----- + +// Tests function with no replica variant ops reachable from a replicate region +// is not cloned. 
+ +// CHECK-LABEL: func @no_replicate_variant_ops +func @no_replicate_variant_ops(%arg0: tensor, %arg1: tensor<2x!tf.string>) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg0] as %arg2: tensor) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2"]}} { + "tf.StatefulPartitionedCall"(%arg1) {config = "", config_proto = "", executor_type = "", f = @send_recv} : (tensor<2x!tf.string>) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: "tf.StatefulPartitionedCall" +// CHECK-SAME: f = @send_recv + +func @send_recv(%arg0: tensor<2x!tf.string>) { + "tf.NoOp"() : () -> () + return +} + +// CHECK-NOT: @send_recv.+( diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir new file mode 100644 index 00000000000..e857831e6be --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir @@ -0,0 +1,363 @@ +// RUN: tf-opt -split-input-file -tf-test-resource-alias-analysis -verify-diagnostics %s | FileCheck %s + +// Test 2 resources that do not alias. + +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @non_aliasing_reads_writes +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @non_aliasing_reads_writes( + %arg0: !tf_res, + %arg1: !tf_res, + %arg2: tensor<32xf32>) -> (tensor<32xf32>) { + %graph = tf_executor.graph { + // CHECK: tf_executor.island + %island:2 = tf_executor.island { + %read0 = "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %arg2) : (!tf_res, tensor<32xf32>) -> () + %read1 = "tf.ReadVariableOp"(%arg1) : (!tf_res) -> tensor<32xf32> + // expected-remark@below {{Result #0, ID 0 : 0}} + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read2 = "tf.ReadVariableOp"(%var_handle) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg1, %read0) : (!tf_res, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg0, %read2) : (!tf_res, tensor<32xf32>) -> () + %read3 = "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor<32xf32> + tf_executor.yield %read3 : tensor<32xf32> + } + tf_executor.fetch %island#0 : tensor<32xf32> + } + return %graph : tensor<32xf32> +} + +// ----- +// Tests aliasing of the two resource handles that refer to the same variable. 
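The remarks verified throughout this new test read as `Result #i, ID n : <set>` or `Region #r, Arg #a, ID n : <set>`: every resource value is assigned a numeric ID, and the trailing set lists the IDs of the values it may alias, itself included; `Unknown` means the value may alias any resource. So in the case below, two handles created with the same `container`/`shared_name` land in each other's sets, for example:

// Both handles name variable "v0", so ID 0 and ID 1 alias: each reports the set {0, 1}.
%vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource<tensor<32xf32>>>
%vh1 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource<tensor<32xf32>>>

Pass-through ops such as `tf.Identity` and `tf.IdentityN` join the same alias set, which is what the `%vh1_id` result in the test below exercises.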
+ +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @aliasing_reads_writes +func @aliasing_reads_writes(%arg0: tensor<32xf32>) -> () { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %vh1 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 0, 1, 2}} + %vh1_id:2 = "tf.IdentityN"(%vh1, %arg0) : (!tf_res, tensor<32xf32>) -> (!tf_res, tensor<32xf32>) + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%vh1_id#0, %arg0) : (!tf_res, tensor<32xf32>) -> () + %read1 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + %read2 = "tf.ReadVariableOp"(%vh1) : (!tf_res) -> tensor<32xf32> + "tf.AssignVariableOp"(%vh0, %read2) : (!tf_res, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%vh1_id#0, %read1) : (!tf_res, tensor<32xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// ----- +// Test an unknown op that has a resource result is marked unknown + +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @unknown_resource_op +func @unknown_resource_op(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %0 = "tf.UnknownVarHandleOp"() : () -> !tf_res +} + +// ----- +// Test aliasing through IfOp + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @if_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +func @if_op_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5}} + // expected-remark@below {{Result #2, ID 3 : 0, 1, 2, 3, 5}} + %if:3 = "tf.If"(%read0, %arg1, %vh0) { + then_branch = @if_then, else_branch = @if_else, is_stateless = true + } : (tensor<32xf32>, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 2 : 0, 1, 2}} +// expected-remark@below {{Region #0, Arg #1, ID 3 : 0, 3}} +func @if_then(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %u0, %id0, %id0 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @if_else(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %id0, %id0, %arg1 : !tf_res, !tf_res, !tf_res +} + +// ----- +// Test aliasing through CaseOp + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @case_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +func @case_op_aliasing(%arg0: !tf_res, 
%arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5}} + // expected-remark@below {{Result #2, ID 3 : 0, 1, 2, 3, 5}} + %if:3 = "tf.Case"(%read0, %arg1, %vh0) { + branches = [@case_branch0, @case_branch1, @case_branch2], + is_stateless = true + } : (tensor, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 2 : 0, 1, 2}} +// expected-remark@below {{Region #0, Arg #1, ID 3 : 0, 3}} +func @case_branch0(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %u0, %id0, %id0 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @case_branch1(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %id0, %id0, %arg1 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 0 : 0}} +// expected-remark@below {{Region #0, Arg #1, ID 1 : 1}} +func @case_branch2(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + return %arg0, %arg0, %arg1 : !tf_res, !tf_res, !tf_res +} + +// ----- +// Test aliasing through WhileOp +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @while_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +// expected-remark@below {{Region #0, Arg #2, ID 6 : 1, 2, 3, 6}} +func @while_op_aliasing(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5, 6}} + // expected-remark@below {{Result #2, ID 3 : 1, 2, 3, 5, 6}} + %w:3 = "tf.While"(%arg0, %arg1, %arg2) { + body = @while_body, cond = @while_cond, is_stateless = false + } : (!tf_res, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// CHECK-LABEL: func @while_body +// Return 0 : new unknown resource +// Return 1 : arg2 +// Return 2 : arg1 +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 0, 2}} +// expected-remark@below {{Region #0, Arg #2, ID 3 : 0, 3}} +func @while_body(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + return %u0, %arg2, %arg1 : !tf_res, !tf_res, !tf_res +} + +// CHECK-LABEL: func @while_cond +// expected-remark@below {{Region #0, Arg #0, ID 0 : 0}} +// expected-remark@below {{Region #0, Arg #1, ID 1 : 1}} +// expected-remark@below {{Region #0, Arg #2, ID 2 : 2}} +func @while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) -> tensor { + %0 = constant dense : tensor + return %0 : tensor +} + +// ----- +// Test alias 
propagation through calls. +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @aliasing_through_calls +func @aliasing_through_calls(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2, 3}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2, 3}} + %vh1 = "tf.Identity"(%vh0) : (!tf_res) -> (!tf_res) + // expected-remark@below {{Result #0, ID 2 : Unknown}} + // expected-remark@below {{Result #1, ID 3 : 0, 1, 2, 3}} + %c:2 = call @passthru(%vh1) : (!tf_res) -> (!tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vx = "tf.VarHandleOp"() {container = "cf", shared_name = "vx"} : () -> !tf_res + return %vx, %arg0 : !tf_res, !tf_res +} + +// ----- +// Test aliasing through IfRegion + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @if_region_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 7 : 1, 4, 6, 7}} +// expected-remark@below {{Region #0, Arg #1, ID 8 : 1, 2, 4, 5, 6, 8}} +func @if_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 3, 4, 5}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor + // expected-remark@below {{Result #0, ID 4 : Unknown}} + // expected-remark@below {{Result #1, ID 5 : 0, 1, 2, 3, 4, 5, 6, 8}} + // expected-remark@below {{Result #2, ID 6 : 1, 2, 4, 5, 6, 7, 8}} + %if:3 = "tf.IfRegion"(%read0) ({ + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 1, 2, 4, 5, 6, 8}} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + "tf.Yield"(%u0, %id0, %id0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + // expected-remark@below {{Result #0, ID 3 : 0, 1, 3, 4, 5}} + %id0 = "tf.Identity"(%vh0) : (!tf_res) -> !tf_res + "tf.Yield"(%id0, %id0, %arg0) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = true} : (tensor) -> (!tf_res, !tf_res, !tf_res) + return +} + +// ----- +// Test aliasing through CaseRegion + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @case_region_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 7 : 1, 4, 6, 7}} +// expected-remark@below {{Region #0, Arg #1, ID 8 : 1, 2, 4, 5, 6, 8}} +func @case_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 3, 4, 5}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor + // expected-remark@below {{Result #0, ID 4 : Unknown}} + // expected-remark@below {{Result #1, ID 5 : 0, 1, 2, 3, 4, 5, 6, 8}} + // expected-remark@below {{Result #2, ID 6 : 1, 2, 4, 5, 6, 7, 8}} + %if:3 = "tf.CaseRegion"(%read0) ({ + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 1, 2, 4, 5, 6, 8}} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + "tf.Yield"(%u0, %id0, %id0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + // expected-remark@below {{Result #0, ID 3 : 0, 1, 3, 4, 5}} + %id0 = "tf.Identity"(%vh0) : (!tf_res) -> !tf_res + "tf.Yield"(%id0, %id0, %arg0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + "tf.Yield"(%vh0, %arg1, 
%arg1) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = true} : (tensor) -> (!tf_res, !tf_res, !tf_res) + return +} + +// ----- +// Test aliasing through WhileRegion +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @while_region_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 11 : 1, 8, 11}} +// expected-remark@below {{Region #0, Arg #1, ID 12 : 1, 8, 9, 10, 12}} +// expected-remark@below {{Region #0, Arg #2, ID 13 : 1, 8, 9, 10, 13}} +func @while_region_aliasing(%arg0: !tf_res, %arg1: !tf_res, %arg2: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 8}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 8 : Unknown}} + // expected-remark@below {{Result #1, ID 9 : 1, 8, 9, 10, 12, 13}} + // expected-remark@below {{Result #2, ID 10 : 1, 8, 9, 10, 12, 13}} + // expected-remark@below {{Region #0, Arg #0, ID 2 : 1, 2, 8}} + // expected-remark@below {{Region #0, Arg #1, ID 3 : 1, 3, 8}} + // expected-remark@below {{Region #0, Arg #2, ID 4 : 1, 4, 8}} + // expected-remark@below {{Region #1, Arg #0, ID 5 : 1, 5, 8}} + // expected-remark@below {{Region #1, Arg #1, ID 6 : 1, 6, 8}} + // expected-remark@below {{Region #1, Arg #2, ID 7 : 1, 7, 8}} + %w:3 = "tf.WhileRegion"(%arg0, %arg1, %arg2) ({ + ^bb0(%carg0: !tf_res, %carg1: !tf_res, %carg2: !tf_res): + %0 = constant dense : tensor + "tf.Yield"(%0) : (tensor) -> () + },{ + ^bb0(%barg0: !tf_res, %barg1: !tf_res, %barg2: !tf_res): + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + "tf.Yield"(%u0, %barg2, %barg1) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = false} : (!tf_res, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// ----- +// Test aliasing through calls +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_calls +func @aliasing_through_calls(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 0, 1, 2}} + %c:2 = call @passthru(%vh0) : (!tf_res) -> (!tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + return %vh0, %arg0 : !tf_res, !tf_res +} + +// ----- +// Test aliasing through tf_device.launch +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_launch +func @aliasing_through_launch(%arg0: tensor<32xf32>) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %vh = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> !tf_res + + // expected-remark@below {{Result #0, ID 1 : 0, 1}} + %launch = "tf_device.launch"() ({ + tf_device.return %vh : !tf_res + }) {device = ""} : () -> !tf_res + return +} + +// ----- +// Test aliasing through tf_device.cluster +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_cluster +func @aliasing_through_cluster(%arg0: tensor<32xf32>) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %vh = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> !tf_res + + // expected-remark@below {{Result #0, ID 1 : 0, 1}} + %cluster = "tf_device.cluster"() ({ + 
tf_device.return %vh : !tf_res + }) : () -> !tf_res + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir index a4a7c1dad2e..75cafde88e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -1,31 +1,33 @@ // RUN: tf-opt -split-input-file -verify-diagnostics -tf-resource-device-inference %s | FileCheck %s +!tf_res = type tensor<*x!tf.resource>> + // Tests that the pass can correctly propagate device attributes inside the same // function. // CHECK-LABEL: func @propagate_in_function func @propagate_in_function( - %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, - %arg1: tensor<*x!tf.resource>> {tf.device = "/TPU:1"}) { + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: !tf_res {tf.device = "/TPU:1"}) { tf_executor.graph { // CHECK: tf_executor.island %island = tf_executor.island { // CHECK-NEXT: "tf.VarHandleOp" %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/CPU:0"} - : () -> tensor<*x!tf.resource>> + : () -> !tf_res // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id1 = "tf.Identity"(%id0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id1 = "tf.Identity"(%id0) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/CPU:0"} - %id2 = "tf.Identity"(%var_handle) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> - %read = "tf.ReadVariableOp"(%id2) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %id2 = "tf.Identity"(%var_handle) : (!tf_res) + -> !tf_res + %read = "tf.ReadVariableOp"(%id2) : (!tf_res) -> tensor<32xf32> %id3 = "tf.Identity"(%read) : (tensor<32xf32>) -> tensor<32xf32> tf_executor.yield } @@ -35,30 +37,31 @@ func @propagate_in_function( } // ----- +!tf_res = type tensor<*x!tf.resource>> // Tesets that the pass can propagate through tf.If's branches. 
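The propagation model behind these cases: a resource value takes the device of its definition, either a `tf.device` argument attribute or the `device` attribute on `tf.VarHandleOp`, and the pass copies that device onto pass-through users such as `tf.Identity` and onto the corresponding arguments of the branch and body functions the resource is passed to. A minimal sketch using the `!tf_res` alias from this file, assuming `%arg0` carries `{tf.device = "/TPU:0"}`:

// Before: the identity of a /TPU:0 resource has no device attribute.
%id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res
// After the pass (sketch): the inferred device is attached.
%id0 = "tf.Identity"(%arg0) {device = "/TPU:0"} : (!tf_res) -> !tf_res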
// CHECK-LABEL: func @propagate_if_op func @propagate_if_op( - %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg0: !tf_res {tf.device = "/TPU:0"}, %arg1: tensor) { tf_executor.graph { // CHECK: tf_executor.island %island = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.VarHandleOp" %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} - : () -> tensor<*x!tf.resource>> + : () -> !tf_res // CHECK-NEXT: "tf.If" "tf.If"(%arg1, %id0, %var_handle) { then_branch = @if_then, else_branch = @if_else, is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) -> () + : (tensor, !tf_res, + !tf_res) -> () tf_executor.yield } tf_executor.fetch %island : !tf_executor.control @@ -68,19 +71,19 @@ func @propagate_if_op( // CHECK-LABEL: func @if_then func @if_then( - %arg0: tensor<*x!tf.resource>>, - %arg1: tensor<*x!tf.resource>>) { + %arg0: !tf_res, + %arg1: !tf_res) { tf_executor.graph { // CHECK: tf_executor.island %island = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:1"} - %id1 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id1 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res tf_executor.yield } tf_executor.fetch %island : !tf_executor.control @@ -90,15 +93,15 @@ func @if_then( // CHECK-LABEL: func @if_else func @if_else( - %arg0: tensor<*x!tf.resource>>, - %arg1: tensor<*x!tf.resource>>) { + %arg0: !tf_res, + %arg1: !tf_res) { tf_executor.graph { // CHECK: tf_executor.island %island = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res tf_executor.yield } tf_executor.fetch %island : !tf_executor.control @@ -108,31 +111,31 @@ func @if_else( // ----- +!tf_res = type tensor<*x!tf.resource>> // Tesets that the pass can propagate through tf.While's branches. 
- // CHECK-LABEL: func @propagate_while_op func @propagate_while_op( - %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg0: !tf_res {tf.device = "/TPU:0"}, %arg1: tensor) { tf_executor.graph { // CHECK: tf_executor.island %island = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.VarHandleOp" %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} - : () -> tensor<*x!tf.resource>> + : () -> !tf_res // CHECK-NEXT: "tf.While" "tf.While"(%arg1, %id0, %var_handle) { body = @while_body, cond = @while_cond, is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) -> - (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) + : (tensor, !tf_res, + !tf_res) -> + (tensor, !tf_res, + !tf_res) tf_executor.yield } tf_executor.fetch %island : !tf_executor.control @@ -143,48 +146,48 @@ func @propagate_while_op( // CHECK-LABEL: func @while_body func @while_body( %arg0: tensor, - %arg1: tensor<*x!tf.resource>>, - %arg2: tensor<*x!tf.resource>>) -> - (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) { + %arg1: !tf_res, + %arg2: !tf_res) -> + (tensor, !tf_res, + !tf_res) { %graph:3 = tf_executor.graph { // CHECK: tf_executor.island %island:4 = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:1"} - %id1 = "tf.Identity"(%arg2) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id1 = "tf.Identity"(%arg2) : (!tf_res) + -> !tf_res tf_executor.yield %arg0, %id0, %id1 - : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> + : tensor, !tf_res, + !tf_res } tf_executor.fetch %island#0, %island#1, %island#2 - : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> + : tensor, !tf_res, + !tf_res } return %graph#0, %graph#1, %graph#2 - : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> + : tensor, !tf_res, + !tf_res } // CHECK-LABEL: func @while_cond func @while_cond( %arg0: tensor, - %arg1: tensor<*x!tf.resource>>, - %arg2: tensor<*x!tf.resource>>) -> tensor<32xf32> { + %arg1: !tf_res, + %arg2: !tf_res) -> tensor<32xf32> { %graph = tf_executor.graph { // CHECK: tf_executor.island %island:2 = tf_executor.island { // CHECK-NEXT: "tf.Identity" // CHECK-SAME: {device = "/TPU:0"} - %id0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res %read = "tf.ReadVariableOp"(%id0) - : (tensor<*x!tf.resource>>) -> tensor<32xf32> + : (!tf_res) -> tensor<32xf32> tf_executor.yield %read : tensor<32xf32> } tf_executor.fetch %island#0 : tensor<32xf32> @@ -193,31 +196,32 @@ func @while_cond( } // ----- +!tf_res = type tensor<*x!tf.resource>> // Tesets that the pass reports error on conflicting assignments from multiple // callers. 
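The conflict below comes from calling the same branch function twice with the resource operands swapped: the first `tf.If` binds the callee's first resource argument to the `/TPU:0` handle, the second binds it to the `/TPU:1` handle, so no single device can be inferred for that argument and the pass reports an error rather than picking one. Schematically, mirroring the test with the predicate type written out:

"tf.If"(%arg1, %id0, %var_handle) {then_branch = @if_then_and_else, else_branch = @if_then_and_else, is_stateless = false} : (tensor<i1>, !tf_res, !tf_res) -> ()
// Same callee, operands swapped: its %arg0 would now need "/TPU:1" as well, which conflicts.
"tf.If"(%arg1, %var_handle, %id0) {then_branch = @if_then_and_else, else_branch = @if_then_and_else, is_stateless = false} : (tensor<i1>, !tf_res, !tf_res) -> ()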
func @error_on_conflict_multiple_callers( - %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg0: !tf_res {tf.device = "/TPU:0"}, %arg1: tensor) { tf_executor.graph { %island = tf_executor.island { - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} - : () -> tensor<*x!tf.resource>> + : () -> !tf_res "tf.If"(%arg1, %id0, %var_handle) { then_branch = @if_then_and_else, else_branch = @if_then_and_else, is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) -> () + : (tensor, !tf_res, + !tf_res) -> () "tf.If"(%arg1, %var_handle, %id0) { // expected-error@above {{Conflicting device assignment for resource}} then_branch = @if_then_and_else, else_branch = @if_then_and_else, is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) -> () + : (tensor, !tf_res, + !tf_res) -> () tf_executor.yield } tf_executor.fetch %island : !tf_executor.control @@ -226,17 +230,311 @@ func @error_on_conflict_multiple_callers( } func @if_then_and_else( - %arg0: tensor<*x!tf.resource>>, - %arg1: tensor<*x!tf.resource>>) { + %arg0: !tf_res, + %arg1: !tf_res) { tf_executor.graph { %island = tf_executor.island { - %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> - %id1 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) - -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + %id1 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res tf_executor.yield } tf_executor.fetch %island : !tf_executor.control } return } + +// ----- + +// Test that the pass can propagate through calls +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @test_function +// CHECK-SAME: {tf.device = "/TPU:0"} +func @test_function(%arg0: !tf_res) { + // CHECK: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + %read = "tf.ReadVariableOp"(%id0) : (!tf_res) -> tensor<32xf32> + %cst = constant dense<3.0> : tensor<32xf32> + %add = "tf.AddV2"(%read, %cst) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %add) : (!tf_res, tensor<32xf32>) -> () + return +} + +// CHECK-LABEL: func @propagate_through_calls +func @propagate_through_calls( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: !tf_res {tf.device = "/TPU:1"}) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/CPU:0"} + : () -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id1 = "tf.Identity"(%id0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/CPU:0"} + %id2 = "tf.Identity"(%var_handle) : (!tf_res) + -> !tf_res + %read = "tf.ReadVariableOp"(%id2) : (!tf_res) -> tensor<32xf32> + %id3 = "tf.Identity"(%read) : (tensor<32xf32>) -> tensor<32xf32> + call @test_function(%id1) : (!tf_res) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// Test propagation through IfRegion (with non-inlined calls) +// CHECK-LABEL: func @propagate_if_region +func @propagate_if_region( + %arg0: !tf_res {tf.device = "/TPU:0"}, + 
%arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} + : () -> !tf_res + // CHECK-NEXT: "tf.IfRegion" + "tf.IfRegion"(%arg1) ({ + call @ifregion_then(%id0, %var_handle) : (!tf_res, !tf_res) -> () + "tf.Yield"() : () -> () + }, { + call @ifregion_else(%id0, %var_handle) : (!tf_res, !tf_res) -> () + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @ifregion_then +// CHECK-SAME: (%arg0: {{.+}} {tf.device = "/TPU:0"}, %arg1: {{.+}} {tf.device = "/TPU:1"} +func @ifregion_then( + %arg0: !tf_res, + %arg1: !tf_res) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @ifregion_else +// CHECK-SAME: (%arg0: {{.+}} {tf.device = "/TPU:0"}, %arg1: {{.+}} {tf.device = "/TPU:1"} +func @ifregion_else( + %arg0: !tf_res, + %arg1: !tf_res) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%arg1) : (!tf_res) + -> !tf_res + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// Test progagation through IfRegion (inlined calls) +// CHECK-LABEL: func @propagate_if_region_inlined +func @propagate_if_region_inlined( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) + -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} + : () -> !tf_res + // CHECK-NEXT: "tf.IfRegion" + "tf.IfRegion"(%arg1) ({ + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id1 = "tf.Identity"(%id0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id2 = "tf.Identity"(%var_handle) : (!tf_res) -> !tf_res + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + "tf.Yield"() : () -> () + }, { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id1 = "tf.Identity"(%id0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id2 = "tf.Identity"(%var_handle) : (!tf_res) -> !tf_res + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor) -> () + tf_executor.yield + } + 
tf_executor.fetch %island : !tf_executor.control + } + return +} + +// Test propagation through WhileRegion (inlined calls) +// CHECK-LABEL: func @propagate_while_region_inlined +func @propagate_while_region_inlined( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} : () -> !tf_res + // CHECK-NEXT: "tf.WhileRegion" + "tf.WhileRegion"(%arg1, %id0, %var_handle) ({ + ^bb0(%carg0: tensor, %carg1: !tf_res, %carg2: !tf_res): + // CHECK: ^bb + // CHECK: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %cid0 = "tf.Identity"(%carg1) : (!tf_res) -> !tf_res loc("cid0") + %read = "tf.ReadVariableOp"(%cid0) : (!tf_res) -> tensor<32xf32> + %cst = constant dense<3.0> : tensor<32xf32> + %cmp = "tf.Less"(%read, %cst) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xi1> + %dims = constant dense<0> : tensor<1xi32> + %reduce = "tf.All"(%cmp, %dims) {keep_dims = false} : (tensor<32xi1>, tensor<1xi32>) -> tensor + "tf.Yield"(%reduce) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: !tf_res, %barg2: !tf_res): + // CHECK: ^bb + // CHECK: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %bid0 = "tf.Identity"(%barg1) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%barg2) : (!tf_res) -> !tf_res + "tf.Yield"(%barg0, %bid0, %id1) : (tensor, !tf_res,!tf_res) -> () + }){is_stateless = false} + : (tensor, !tf_res, !tf_res) -> (tensor, !tf_res, !tf_res) + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// Test propagation through WhileRegion (non-inlined calls) +// CHECK-LABEL: func @propagate_while_region +func @propagate_while_region( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} : () -> !tf_res + // CHECK-NEXT: "tf.WhileRegion" + "tf.WhileRegion"(%arg1, %id0, %var_handle) ({ + ^bb0(%carg0: tensor, %carg1: !tf_res, %carg2: !tf_res): + %cond = call @whileregion_cond(%carg0, %carg1, %carg2) : (tensor, !tf_res, !tf_res) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: !tf_res, %barg2: !tf_res): + %new_values:3 = call @whileregion_body(%barg0, %barg1, %barg2) : (tensor, !tf_res,!tf_res) -> (tensor, !tf_res,!tf_res) + "tf.Yield"(%new_values#0, %new_values#1, %new_values#2) : (tensor, !tf_res,!tf_res) -> () + }){is_stateless = false} + : (tensor, !tf_res, !tf_res) -> (tensor, !tf_res, !tf_res) + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @whileregion_body +func @whileregion_body(%arg0: tensor, %arg1: !tf_res, %arg2: !tf_res) -> (tensor, !tf_res, !tf_res) { + %graph:3 = tf_executor.graph { + // CHECK: tf_executor.island + %island:4 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: 
{device = "/TPU:1"} + %id1 = "tf.Identity"(%arg2) : (!tf_res) -> !tf_res + tf_executor.yield %arg0, %id0, %id1 : tensor, !tf_res, !tf_res + } + tf_executor.fetch %island#0, %island#1, %island#2 : tensor, !tf_res, !tf_res + } + return %graph#0, %graph#1, %graph#2: tensor, !tf_res, !tf_res +} + +// CHECK-LABEL: func @whileregion_cond +func @whileregion_cond(%arg0: tensor, %arg1: !tf_res, %arg2: !tf_res) -> tensor { + %graph = tf_executor.graph { + // CHECK: tf_executor.island + %island:2 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + %read = "tf.ReadVariableOp"(%id0) : (!tf_res) -> tensor<32xf32> + %cst = constant dense<3.0> : tensor<32xf32> + %cmp = "tf.Less"(%read, %cst) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xi1> + %dims = constant dense<0> : tensor<1xi32> + %reduce = "tf.All"(%cmp, %dims) {keep_dims = false} : (tensor<32xi1>, tensor<1xi32>) -> tensor + tf_executor.yield %reduce : tensor + } + tf_executor.fetch %island#0 : tensor + } + return %graph : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index ac5c2df8f7e..8457d9c62cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -8,7 +8,7 @@ func @only_resource_load() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) // CHECK: "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]] @@ -39,7 +39,7 @@ func @only_resource_store() -> tensor<*xi32> { // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) %1 = "tf_device.cluster"() ( { %2 = "tf.SomeComputation"() : () -> (tensor<*xi32>) @@ -61,13 +61,13 @@ func @same_resource_load_and_store() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) %1 = "tf_device.cluster"() ( { %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> @@ -112,26 +112,6 @@ func @internal_resource() -> tensor<*xi32> { // ----- -// Tests that pass fails when there are remaining resource operationss that can -// not be lifted. 
- -func @lifting_failure() -> tensor<*xi32> { - - %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - - // expected-error @+1 {{has remaining resource inputs that can not be lifted}} - %1 = "tf_device.cluster"() ( { - %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> - %3 = "tf.SomeResourceOp"(%0, %2) : (tensor<*x!tf.resource>, tensor<*xi32>) -> tensor<*xi32> - "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () - tf_device.return %3 : tensor<*xi32> - }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> - - return %1 : tensor<*xi32> -} - -// ----- - // Tests that pass lifts resource reads/writes from a loop, and removed unused // resources. @@ -328,6 +308,7 @@ func @while_cond1(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!t func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { + // expected-error@+1 {{result #0 not tied to function argument for branch @while_body}} %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) @@ -337,7 +318,6 @@ func @cluster_with_loop() -> () { } func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> - // expected-error @+1 {{resource used in while loop is only supported when the resource input and output alias each other in the loop body}} return %0 : tensor<*x!tf.resource>> } func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { @@ -347,35 +327,12 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // ----- -// Tests that pass reports error on unsupported ops in loop body. - -func @cluster_with_loop() -> () { - %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> - "tf_device.cluster"() ( { - %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false} - : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) - tf_device.return - }) {cluster_attr = "cluster_attr"} : () -> () - return -} -func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { - // expected-error @+1 {{found unsupported operations on resource.}} - "tf._UnknownOp"(%arg0) : (tensor<*x!tf.resource>>) -> () - return %arg0 : tensor<*x!tf.resource>> -} -func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { - %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor - return %read : tensor -} - -// ----- - // Tests that pass reports error on unsupported ops in loop cond. 
func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { + // expected-error@+1 {{found resource write in loop condition.}} %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) @@ -391,7 +348,6 @@ func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.re func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor %constant = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor - // expected-error @+1 {{found resource write in loop condition.}} "tf.AssignVariableOp"(%arg0, %constant) : (tensor<*x!tf.resource>>, tensor) -> () return %read : tensor } @@ -409,7 +365,7 @@ func @cluster_with_case(%arg0: tensor) -> tensor<4xf32> { // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() %2 = "tf_device.cluster"() ( { // CHECK: %[[CASE:.*]]:2 = "tf.Case"(%[[ARG0]], %[[READ0]], %[[READ1]]) - %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2]} + %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2], is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<4xf32>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0) @@ -571,7 +527,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf_device.cluster"() ( { - // expected-error @+1 {{unsupported output: resource does not alias a single input}} + // expected-error @+1 {{result #0 is not tied to the same argument across all branches}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) @@ -598,7 +554,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf_device.cluster"() ( { - // expected-error @+1 {{unsupported output: resource does not alias input}} + // expected-error @+1 {{result #0 not tied to function argument for branch @if_then}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) @@ -757,3 +713,381 @@ func @callee(%arg0: tensor<*x!tf.resource>>) -> tensor { // CHECK: func @callee_resource_lifted(%[[A0:.*]]: tensor) -> tensor // CHECK-NEXT: return %[[A0]] + +// ----- + +// Test that the pass can lift resources out of IfRegion +// CHECK: func @cluster_with_ifregion(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_ifregion(%arg0: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[IF:.*]]:2 = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + 
%3:2 = "tf.IfRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: "tf.Yield"(%[[READ1]], %[[READ1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// Test that the pass can lift resources out of CaseRegion +// CHECK: func @cluster_with_caseregion(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_caseregion(%arg0: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[CASE:.*]]:2 = "tf.CaseRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + %3:2 = "tf.CaseRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: "tf.Yield"(%[[READ1]], %[[READ1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: %[[CONST1:.*]] = "tf.Const" + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[READ1]], %[[CONST1]]) + // CHECK: "tf.Yield"(%[[READ1]], %[[SUB]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %constant = "tf.Const"() {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> + %sub = "tf.Sub"(%read, %constant) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %sub) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> 
(tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[CASE]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// ----- + +// Test that the pass can lift resources out of WhileRegion +// CHECK-LABEL: func @cluster_with_whileregion +func @cluster_with_whileregion() -> () { + // CHECK: %[[COUNT:.*]] = "tf.Const"() {value = dense<10> : tensor} + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[WHILE:.*]]:2 = "tf.WhileRegion"(%[[COUNT]], %[[READ]]) + %0 = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %unused = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ( { + %2:3 = "tf.WhileRegion"(%0, %1, %unused) ({ + // CHECK: (%[[CARG0:.+]]: tensor, %[[CARG1:.+]]: tensor): + // CHECK: %[[CAST:.+]] = "tf.Cast"(%[[CARG1]]) + // CHECK: "tf.Less"(%[[CARG0]], %[[CAST]]) + // CHECK: "tf.Yield" + ^bb0(%carg0: tensor, %carg1:tensor<*x!tf.resource>>, %carg2: tensor<*x!tf.resource>>): + %read0 = "tf.ReadVariableOp"(%carg1) : (tensor<*x!tf.resource>>) -> tensor + %cast = "tf.Cast"(%read0) : (tensor) -> tensor + %cond = "tf.Less"(%carg0, %cast) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + // CHECK: (%[[BARG0:.+]]: tensor, %[[BARG1:.+]]: tensor): + // CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[BARG1]], %[[BARG1]]) + // CHECK-NEXT: %[[ADD1:.*]] = "tf.AddV2"(%[[ADD0]], %[[ADD0]]) + // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() {value = dense<-1> : tensor} + // CHECK-NEXT: %[[ADD2:.*]] = "tf.AddV2"(%[[BARG0]], %[[DELTA]]) + // CHECK-NEXT: "tf.Yield"(%[[ADD2]], %[[ADD1]]) + ^bb1(%barg0: tensor, %barg1:tensor<*x!tf.resource>>, %barg2: tensor<*x!tf.resource>>): + %read0 = "tf.ReadVariableOp"(%barg1) : (tensor<*x!tf.resource>>) -> tensor + %add0 = "tf.AddV2"(%read0, %read0) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%barg1, %add0) : (tensor<*x!tf.resource>>, tensor) -> () + %read1 = "tf.ReadVariableOp"(%barg1) : (tensor<*x!tf.resource>>) -> tensor + %add1 = "tf.AddV2"(%read1, %read1) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%barg1, %add1) : (tensor<*x!tf.resource>>, tensor) -> () + %constant = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %add2 = "tf.AddV2"(%barg0, %constant) : (tensor, tensor) -> tensor + %id = "tf.Identity"(%barg2) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + "tf.Yield"(%add2, %barg1, %id) : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () + }) {device = "", is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: tf_device.return %[[WHILE]]#1 : tensor + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[CLUSTER]]) 
+ // CHECK: return + return +} + +// ----- + +// Test that the pass can lift out recursively (If with another if it its body) +// CHECK: func @cluster_with_if_within_if(%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_if_within_if(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[IF:.*]]:2 = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + %3:2 = "tf.IfRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: %[[IF1:.*]] = "tf.IfRegion" + // CHECK: "tf.Yield"(%[[READ1]]) + // CHECK: "tf.Yield"(%[[READ0]]) + // CHECK: "tf.Yield"(%[[IF1]], %[[IF1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.IfRegion"(%arg1) ({ + %read_then = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.Yield"(%read_then) : (tensor<4xf32>) -> () + }, { + %read_else = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.Yield"(%read_else) : (tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// ----- + +// IfRegion with store in just one branch + +// CHECK: func @if_region_with_store_in_then(%[[ARG0:.*]]: tensor) +func @if_region_with_store_in_then(%arg0: tensor) { + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[IF:.*]] = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ({ + "tf.IfRegion"(%arg0) ({ + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }, { + // CHECK: "tf.Yield"(%[[READ]]) + "tf.Yield"() : () -> () + 
}) { is_stateless = true} : (tensor) -> () + tf_device.return + }) { cluster_attr = "cluster_attr" } : () -> () + // CHECK: tf_device.return %[[IF]] + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]) + return +} + +// ----- + +// IfRegion with store in both branches + +// CHECK: func @if_region_with_store_in_both(%[[ARG0:.*]]: tensor) +func @if_region_with_store_in_both(%arg0: tensor) { + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[IF:.*]] = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ({ + "tf.IfRegion"(%arg0) ({ + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }, { + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }) { is_stateless = true} : (tensor) -> () + tf_device.return + }) { cluster_attr = "cluster_attr" } : () -> () + // CHECK: tf_device.return %[[IF]] + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]) + return +} + + +// Make sure unsupported resources are handled correctly. If a resource is used +// in an unsupported op, resource op lifting should skip lifting that resource. +// So for the below test, the IR should stay unchanged. +// CHECK-LABEL: func @test_unsupported_resource_op +func @test_unsupported_resource_op() -> tensor<*xi32> { + // CHECK: "tf.VarHandleOp" + // CHECK: "tf_device.cluster"() ( { + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.SomeResourceOperation" + // CHECK: "tf.SomeComputation" + // CHECK: tf_device.return + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK: return + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> + "tf.SomeResourceOperation"(%0) : (tensor<*x!tf.resource>) -> () + %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + return %1 : tensor<*xi32> +} + +// Test unsupported use of resource ops in functional control flow. In the test +// below, arg0 has an unsupported use whereas arg1 does not. So we expect arg0 +// to not be lifted and arg1 to be lifted. 
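For readers unfamiliar with the pass, the sketch below shows what "lifting" does to a simple read-modify-write around a cluster. It is modeled on the @same_resource_load_and_store CHECK pattern earlier in this file; the op name "tf.SomeComputation" and the f32 element type are illustrative, not part of the change.

func @lifting_sketch_before() -> tensor<f32> {
  %v = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource<tensor<f32>>>
  %r = "tf_device.cluster"() ( {
    // Read, compute and write all happen inside the cluster.
    %x = "tf.ReadVariableOp"(%v) : (tensor<*x!tf.resource<tensor<f32>>>) -> tensor<f32>
    %y = "tf.SomeComputation"(%x) : (tensor<f32>) -> tensor<f32>
    "tf.AssignVariableOp"(%v, %y) : (tensor<*x!tf.resource<tensor<f32>>>, tensor<f32>) -> ()
    tf_device.return %y : tensor<f32>
  }) {cluster_attr = "cluster_attr"} : () -> tensor<f32>
  return %r : tensor<f32>
}

func @lifting_sketch_after() -> tensor<f32> {
  %v = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource<tensor<f32>>>
  // The read is hoisted above the cluster and the written value is forwarded
  // out as an extra cluster result, so the cluster body is resource free.
  %x = "tf.ReadVariableOp"(%v) : (tensor<*x!tf.resource<tensor<f32>>>) -> tensor<f32>
  %r:2 = "tf_device.cluster"() ( {
    %y = "tf.SomeComputation"(%x) : (tensor<f32>) -> tensor<f32>
    tf_device.return %y, %y : tensor<f32>, tensor<f32>
  }) {cluster_attr = "cluster_attr"} : () -> (tensor<f32>, tensor<f32>)
  "tf.AssignVariableOp"(%v, %r#1) : (tensor<*x!tf.resource<tensor<f32>>>, tensor<f32>) -> ()
  return %r#0 : tensor<f32>
}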
+// CHECK-LABEL: func @test_unsupported_resource_op_in_if +func @test_unsupported_resource_op_in_if(%arg0: tensor) -> tensor<*xi32> { + // CHECK: [[VH0:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} + // CHECK: [[VH1:%.*]] = "tf.VarHandleOp"() {container = "d", shared_name = "w"} + // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) + // CHECK: [[READ1:%.*]] = "tf.ReadVariableOp"([[VH1]]) + // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) + // CHECK: "tf_device.cluster"() ( { + // CHECK: "tf.If"({{%.*}}, [[VH0]], [[READ1]]) + // CHECK-SAME: else_branch = @else_fn, is_stateless = true, then_branch = @then_fn + // CHECK: tf_device.return + // CHECK: return + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf.VarHandleOp"() {container = "d", shared_name = "w"} : () -> tensor<*x!tf.resource> + %2 = "tf_device.cluster"() ( { + %3 = "tf.If"(%arg0, %0, %1) + { else_branch = @else_fn, then_branch = @then_fn, is_stateless = true} + : (tensor, tensor<*x!tf.resource>, tensor<*x!tf.resource>) -> tensor<*xi32> + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + return %2 : tensor<*xi32> +} + +// CHECK-LABEL: func @else_fn +// CHECK-SAME: (%{{.*}}: tensor<*x!tf.resource>, %{{.*}}: tensor<*xi32>) +func @else_fn(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> tensor<*xi32> { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + return %2 : tensor<*xi32> +} + +// CHECK-LABEL: func @then_fn +// CHECK-SAME: (%{{.*}}: tensor<*x!tf.resource>, %{{.*}}: tensor<*xi32>) +func @then_fn(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> tensor<*xi32> { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + "tf.UnsupportedResourceOp"(%arg0) : (tensor<*x!tf.resource>) -> () + return %2 : tensor<*xi32> +} + +// Test type refinement. If the resource has a single subtype, check that that +// type gets used when hoisting the read. None of the result types will change. 
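Written out in full, the resource types in the two refinement tests carry an element subtype, e.g. tensor<*x!tf.resource<tensor<4xi32>>> for a handle whose variable is known to be tensor<4xi32>. A minimal sketch of the refinement, assuming that syntax; only the hoisted read changes type, the cluster result types do not:

func @subtype_refinement_sketch() -> tensor<*xi32> {
  // Handle with a single subtype, tensor<4xi32>.
  %h = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource<tensor<4xi32>>>
  // The hoisted read adopts the subtype instead of tensor<*xi32> ...
  %x = "tf.ReadVariableOp"(%h) : (tensor<*x!tf.resource<tensor<4xi32>>>) -> tensor<4xi32>
  %r = "tf_device.cluster"() ( {
    // ... while the body and the cluster keep their original result types.
    %y = "tf.SomeComputation"(%x) : (tensor<4xi32>) -> tensor<*xi32>
    tf_device.return %y : tensor<*xi32>
  }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32>
  return %r : tensor<*xi32>
}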
+// CHECK-LABEL: func @type_refinement_use_subtype +func @type_refinement_use_subtype() -> tensor<*xi32> { + + // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) + // CHECK-SAME: -> tensor<4xi32> + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) : (tensor<4xi32>) -> tensor<*xi32> + // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] + // CHECK-SAME: tensor<*xi32>, tensor<*xi32> + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) + + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>>) -> tensor<*xi32> + %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>>, tensor<*xi32>) -> () + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + // CHECK: return %[[CLUSTER_RES]]#0 + // CHECK-SAME: tensor<*xi32> + return %1 : tensor<*xi32> +} + +// If multiple types are used across reads and writes, check that the read uses +// the most refined type. The first ReadVariable should refine the type from +// *xi32 to ?xi32 and the assign should refine it further to 4xi32. +// CHECK-LABEL: func @type_refinement_use_refined_type +func @type_refinement_use_refined_type() -> tensor<4xi32> { + + // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) + // CHECK-SAME: -> tensor<4xi32> + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) : (tensor<4xi32>) -> tensor<4xi32> + // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] + // CHECK-SAME: tensor<4xi32>, tensor<4xi32> + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> (tensor<4xi32>, tensor<4xi32>) + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) + + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>>) -> tensor + %3 = "tf.SomeComputation"(%2) : (tensor) -> (tensor<4xi32>) + "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>>, tensor<4xi32>) -> () + tf_device.return %3 : tensor<4xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xi32> + + // CHECK: return %[[CLUSTER_RES]]#0 + // CHECK-SAME: tensor<4xi32> + return %1 : tensor<4xi32> +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 4a5e3c8deaa..26df60229e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -2,69 +2,69 @@ // RUN: tf-opt %s -tf-shape-inference=propagate-caller-callee-constants -verify-diagnostics | FileCheck %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { -// CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> + // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) 
-> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK: %[[RESULT:.*]] = "tf.AddV2" - // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - // CHECK: return %[[RESULT]] : tensor<1xi32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2" + // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> %2 = "tf.AddV2"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> return %2 : tensor<*xi32> } -// CHECK-LABEL: func @simple_chain + // CHECK-LABEL: func @simple_chain func @simple_chain(%arg0: tensor<1xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: return %[[ADD]] : tensor<1xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: return %[[ADD]] : tensor<1xf32> %0 = "tf.Mul"(%arg0, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> %1 = "tf.Add"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> } -// CHECK-LABEL: func @simple_chain_with_broadcast + // CHECK-LABEL: func @simple_chain_with_broadcast func @simple_chain_with_broadcast(%arg0: tensor<1xf32>, %arg1: tensor<10xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<10xf32>) -> tensor<10xf32> -// CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> -// CHECK: %[[CAST:.*]] = "tf.Cast"(%[[ADD]]) {{.*}} : (tensor<10xf32>) -> tensor<*xf32> -// CHECK: %[[UNKNOWN:.*]] = addf %[[CAST]], %[[CAST]] : tensor<*xf32> -// CHECK: return %[[UNKNOWN]] : tensor<*xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<10xf32>) -> tensor<10xf32> + // CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + // CHECK: %[[CAST:.*]] = "tf.Cast"(%[[ADD]]) {{.*}} : (tensor<10xf32>) -> tensor<*xf32> + // CHECK: %[[UNKNOWN:.*]] = addf %[[CAST]], %[[CAST]] : tensor<*xf32> + // CHECK: return %[[UNKNOWN]] : tensor<*xf32> %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xf32>, tensor<10xf32>) -> tensor<*xf32> %1 = "tf.Add"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> %2 = addf %1, %1 : tensor<*xf32> return %2 : tensor<*xf32> } -// CHECK-LABEL: func @unknown_op + // CHECK-LABEL: func @unknown_op func @unknown_op(%arg0: tensor<1xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: %[[UNKNOWN:.*]] = "tf.Unknown"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> -// CHECK: return %[[UNKNOWN]] : tensor<*xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: %[[UNKNOWN:.*]] = "tf.Unknown"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + // CHECK: return %[[UNKNOWN]] : tensor<*xf32> %0 = "tf.Mul"(%arg0, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> %1 = "tf.Unknown"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> } -// CHECK-LABEL: func @multiple_blocks_one_return(%arg0: tensor) -> tensor -func @multiple_blocks_one_return(%arg0: tensor) -> 
tensor<*xf32> { - br ^bb1 -^bb1: -// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg0) : (tensor) -> tensor -// CHECK: return %[[IDENTITY]] : tensor - %ret = "tf.Identity"(%arg0) : (tensor) -> tensor<*xf32> - return %ret : tensor<*xf32> -} + // CHECK-LABEL: func @multiple_blocks_one_return(%arg0: tensor) -> tensor + func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { + br ^bb1 + ^bb1: + // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg0) : (tensor) -> tensor + // CHECK: return %[[IDENTITY]] : tensor + %ret = "tf.Identity"(%arg0) : (tensor) -> tensor<*xf32> + return %ret : tensor<*xf32> + } -// Tests the case where an inference opportunity relies on folding. + // Tests the case where an inference opportunity relies on folding. -// CHECK-LABEL: func @simple_folding + // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[SHAPE:.*]] = "tf.Shape" -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] -// CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -// CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> + // CHECK: %[[SHAPE:.*]] = "tf.Shape" + // CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] + // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> %1 = "tf.Conv2DBackpropInput"(%0, %arg1, %arg1) { padding = "VALID", strides = [1, 1, 1, 1] @@ -72,7 +72,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %1 : tensor } -// Tests where tf.Const's value needs to be refined. + // Tests where tf.Const's value needs to be refined. func @const_refine() -> tensor<*xi32> { %0 = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<*xi32> @@ -81,9 +81,9 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } -// Tests the case where an op's shape function returns non-fully-defined shapes. + // Tests the case where an op's shape function returns non-fully-defined shapes. 
-// CHECK-LABEL: func @op_non_fully_defined_shape_fn + // CHECK-LABEL: func @op_non_fully_defined_shape_fn func @op_non_fully_defined_shape_fn(%arg0: tensor<0xi32>, %arg1: tensor<0xi32>) -> tensor { // CHECK: tf.BroadcastGradientArgs // CHECK-SAME: (tensor<0xi32>, tensor<0xi32>) -> (tensor, tensor) @@ -91,7 +91,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %2#0 : tensor } -// CHECK-LABEL: func @shape_from_const_input + // CHECK-LABEL: func @shape_from_const_input func @shape_from_const_input(%arg0: tensor<3x3x32x64xf32>, %arg1: tensor<200x24x24x64xf32>) -> tensor { %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<4xi32> // CHECK: tf.Conv2DBackpropInput @@ -223,7 +223,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-SAME: %[[ARG_1:.*]]: tensor>> func @shape_from_case_to_branch_functions(%arg0: tensor, %arg1: tensor>>) -> tensor<1x2x3xf32> { // CHECK: %[[CASE:.*]] = "tf.Case"(%[[ARG_0]], %[[ARG_1]]) - %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch_0, @branch_1]} : (tensor, tensor>>) -> tensor<1x2x3xf32> + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch_0, @branch_1], is_stateless = false} : (tensor, tensor>>) -> tensor<1x2x3xf32> // CHECK: return %[[CASE]] : tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } @@ -530,6 +530,21 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> } + // CHECK-LABEL: infer_device_cluster + func @infer_device_cluster(%arg0: tensor<1x8x2xi32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf_device.cluster"() ({ + %2 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x8x2xi32>) -> tensor<1x8x2xf32> + tf_device.return %2 : tensor<1x8x2xf32> + // CHECK: () -> tensor<1x8x2xf32> + }) : () -> tensor<*xf32> + // CHECK: "tf.Cast"(%{{.*}}) {Truncate = false} : (tensor<1x8x2xf32>) -> tensor<*xf32> + // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) + %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) + %4 = addf %1, %1 : tensor<*xf32> + return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> + } + // CHECK-LABEL: func @tensor_cast(%arg0: tensor<1xi32>) -> tensor<1xi32> func @tensor_cast(%arg0: tensor<1xi32>) -> tensor<*xi32> { // CHECK: %[[RESULT:.*]] = tensor_cast diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index d79e028ba9e..5eacbdea180 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -277,7 +277,7 @@ func @with_replicate( // ----- -// Tests that the pass does not add control dependencies a stateless if op. +// Tests that the pass does not add control dependencies for a stateless if op. // CHECK-LABEL: func @stateless_if_op func @stateless_if_op( @@ -361,6 +361,83 @@ func @if_else(%arg0: tensor) -> tensor { // ----- +// Tests that the pass does not add control dependencies for a stateless +// IfRegion op. 
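A note on reading the annotations in the side-effect-analysis tests below: every op gets an {{ID: N}} remark, with IDs apparently assigned in textual order except that an op owning regions is numbered after everything nested inside it, and {{Successors: ...}} / {{Predecessors: ...}} list the IDs of ops it must be ordered before / after. A reduced fragment for orientation (assume %arg0 is a resource-typed function argument and the island sits inside a tf_executor.graph; IDs are local to this sketch):

  %island = tf_executor.island {
    %r = "tf.ReadVariableOp"(%arg0)
    // expected-remark@above {{ID: 0}}
    // expected-remark@above {{Successors: {1}}}
         : (tensor<*x!tf.resource<tensor<32xf32>>>) -> tensor<32xf32>
    "tf.AssignVariableOp"(%arg0, %r)
    // expected-remark@above {{ID: 1}}
    // expected-remark@above {{Predecessors: {0}}}
    // expected-remark@above {{Successors: {2}}}
         : (tensor<*x!tf.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
    tf_executor.yield
    // expected-remark@above {{ID: 2}}
    // expected-remark@above {{Predecessors: {1}}}
  }
  // expected-remark@above {{ID: 3}}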
+ +// CHECK-LABEL: func @stateless_ifregion_op +func @stateless_ifregion_op( + // expected-remark@above {{ID: 18}} + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor) { + tf_executor.graph { + // expected-remark@above {{ID: 16}} + // expected-remark@above {{Successors: {17}}} + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@above {{ID: 14}} + // expected-remark@above {{Successors: {15}}} + + %r0 = "tf.ReadVariableOp"(%arg0) : + // expected-remark@above {{ID: 0}} + // expected-remark@above {{Successors: {12}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + + %if = "tf.IfRegion"(%arg1) ( + // expected-remark@above {{ID: 11}} + { // Then region. + %graph = tf_executor.graph { + // expected-remark@above {{ID: 4}} + %island:2 = tf_executor.island { + // expected-remark@above {{ID: 2}} + // expected-remark@above {{Successors: {3}}} + tf_executor.yield %arg1 : tensor + // expected-remark@above {{ID: 1}} + } + tf_executor.fetch %island#0 : tensor + // expected-remark@above {{ID: 3}} + // expected-remark@above {{Predecessors: {2}}} + } + "tf.Yield"(%graph) : (tensor) -> () + // expected-remark@above {{ID: 5}} + }, { // Else region + %graph = tf_executor.graph { + // expected-remark@above {{ID: 9}} + %island:2 = tf_executor.island { + // expected-remark@above {{ID: 7}} + // expected-remark@above {{Successors: {8}}} + tf_executor.yield %arg1 : tensor + // expected-remark@above {{ID: 6}} + } + tf_executor.fetch %island#0 : tensor + // expected-remark@above {{ID: 8}} + // expected-remark@above {{Predecessors: {7}}} + } + "tf.Yield"(%graph) : (tensor) -> () + // expected-remark@above {{ID: 10}} + } + ) { is_stateless = true} : (tensor) -> tensor + + "tf.AssignVariableOp"(%arg0, %r0) : + // expected-remark@above {{ID: 12}} + // expected-remark@above {{Predecessors: {0}}} + // expected-remark@above {{Successors: {13}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + + tf_executor.yield + // expected-remark@above {{ID: 13}} + // expected-remark@above {{Predecessors: {12}}} + } + tf_executor.fetch %island : !tf_executor.control + // expected-remark@above {{ID: 15}} + // expected-remark@above {{Predecessors: {14}}} + } + return + // expected-remark@above {{ID: 17}} + // expected-remark@above {{Predecessors: {16}}} +} + +// ----- + // Tests that the pass does not add control dependencies a stateless while op. // CHECK-LABEL: func @stateless_if_op @@ -379,7 +456,7 @@ func @stateless_if_op( // expected-remark@above {{ID: 0}} // expected-remark@above {{Successors: {2}}} (tensor<*x!tf.resource>>) -> tensor<32xf32> - %if = "tf.While"(%arg1) { + %while = "tf.While"(%arg1) { // expected-remark@above {{ID: 1}} body = @while_body, cond = @while_cond, is_stateless = true} : (tensor) -> tensor @@ -445,9 +522,98 @@ func @while_cond(%arg0: tensor) -> tensor { // ----- +// Tests that the pass does not add control dependencies a stateless WhileRegion +// op. 
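For contrast with the stateless test below: when is_stateless is false the analysis can no longer assume the region op is side-effect free, so it would be ordered conservatively between the surrounding resource ops (roughly read -> WhileRegion -> assign instead of read -> assign), which is what the @output_of_*_op tests later in this file exercise in detail. A minimal stateful counterpart, with the edges it would likely pick up noted in comments (a sketch, not a test in this change):

  %r0 = "tf.ReadVariableOp"(%arg0)            // would gain a successor edge to the WhileRegion
       : (tensor<*x!tf.resource<tensor<32xf32>>>) -> tensor<32xf32>
  %w = "tf.WhileRegion"(%arg1) ( {
    ^bb0(%carg: tensor<i1>):
      "tf.Yield"(%carg) : (tensor<i1>) -> ()
  }, {
    ^bb0(%barg: tensor<i1>):
      "tf.Yield"(%barg) : (tensor<i1>) -> ()
  }) {is_stateless = false} : (tensor<i1>) -> tensor<i1>
  "tf.AssignVariableOp"(%arg0, %r0)           // would gain a predecessor edge from the WhileRegion
       : (tensor<*x!tf.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()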
+ +// CHECK-LABEL: func @stateless_whileregion_op +func @stateless_whileregion_op( + // expected-remark@above {{ID: 18}} + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor) { + tf_executor.graph { + // expected-remark@above {{ID: 16}} + // expected-remark@above {{Successors: {17}}} + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@above {{ID: 14}} + // expected-remark@above {{Successors: {15}}} + %r0 = "tf.ReadVariableOp"(%arg0) : + // expected-remark@above {{ID: 0}} + // expected-remark@above {{Successors: {12}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + + %while = "tf.WhileRegion"(%arg1) ( + // expected-remark@above {{ID: 11}} + { + ^bb0(%carg: tensor): + %graph = tf_executor.graph { + // expected-remark@above {{ID: 4}} + %island:2 = tf_executor.island { + // expected-remark@above {{ID: 2}} + // expected-remark@above {{Successors: {3}}} + tf_executor.yield %carg : tensor + // expected-remark@above {{ID: 1}} + } + tf_executor.fetch %island#0 : tensor + // expected-remark@above {{ID: 3}} + // expected-remark@above {{Predecessors: {2}}} + } + "tf.Yield"(%graph) : (tensor) -> () + // expected-remark@above {{ID: 5}} + }, { + ^bb0(%barg: tensor): + %graph = tf_executor.graph { + // expected-remark@above {{ID: 9}} + %island:2 = tf_executor.island { + // expected-remark@above {{ID: 7}} + // expected-remark@above {{Successors: {8}}} + tf_executor.yield %barg : tensor + // expected-remark@above {{ID: 6}} + } + tf_executor.fetch %island#0 : tensor + // expected-remark@above {{ID: 8}} + // expected-remark@above {{Predecessors: {7}}} + } + "tf.Yield"(%graph) : (tensor) -> () + // expected-remark@above {{ID: 10}} + } + ) {is_stateless = true} : (tensor) -> tensor + "tf.AssignVariableOp"(%arg0, %r0) : + // expected-remark@above {{ID: 12}} + // expected-remark@above {{Predecessors: {0}}} + // expected-remark@above {{Successors: {13}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + tf_executor.yield + // expected-remark@above {{ID: 13}} + // expected-remark@above {{Predecessors: {12}}} + } + tf_executor.fetch %island : !tf_executor.control + // expected-remark@above {{ID: 15}} + // expected-remark@above {{Predecessors: {14}}} + } + return + // expected-remark@above {{ID: 17}} + // expected-remark@above {{Predecessors: {16}}} +} + +// ----- + // Tests that the pass tracks control dependencies for variables from an if op's // output. +// In this test, the resources computed and used are as follows: +// (* = unknown resource id which aliases with everything else) +// id0 = arg0 +// if-then-branch: [u0, arg0, arg0] +// if-else-branch: [arg0, arg0, arg1] +// => first result is unknown, second and third is passthrough +// if results : [*, arg0, {arg0, arg1}[ +// ID #2: read (unknown) -> succ {5, 6) +// ID #3: read (arg0) -> succ {5} +// ID #4: read({arg0,arg1}) -> succ {5,6} +// ID #5: write(arg0) +// ID #6: write(arg1) + // CHECK-LABEL: func @output_of_if_op func @output_of_if_op( // expected-remark@above {{ID: 12}} @@ -597,9 +763,151 @@ func @if_else( // ----- +// Tests that the pass tracks control dependencies for variables from an +// IfRegion op's output. 
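The same per-result bookkeeping as in the functional test above applies to the region form below. Spelled out for @output_of_ifregion_op, following the expected remarks in that test:

  then region yields (%u0,   %iid0, %iid0)  ->  per-result ids [unknown, arg0, arg0]
  else region yields (%iid0, %iid0, %iid1)  ->  per-result ids [arg0,    arg0, arg1]
  merged IfRegion results                   ->  [unknown, arg0, {arg0, arg1}]

So the reads of result #0 (ID 16) and result #2 (ID 18) must precede both writes (IDs 19 and 20), while the read of result #1 (ID 17) only orders against the write to arg0 (ID 19).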
+ +// CHECK-LABEL: func @output_of_ifregion_op +func @output_of_ifregion_op( + // expected-remark@above {{ID: 26}} + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor) { + tf_executor.graph { + // expected-remark@above {{ID: 24}} + // expected-remark@above {{Successors: {25}}} + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@above {{ID: 22}} + // expected-remark@above {{Successors: {23}}} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 0}} + -> tensor<*x!tf.resource>> + %if:3 = "tf.IfRegion"(%arg2) ( + // expected-remark@above {{ID: 15}} + // expected-remark@above {{Successors: {16,17,18}}} + { + %graph:3 = tf_executor.graph { + // expected-remark@above {{ID: 6}} + %island:4 = tf_executor.island { + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Successors: {5}}} + %u0 = "tf._UnknownSideEffectingOp_"() : () + // expected-remark@above {{ID: 1}} + // expected-remark@above {{Successors: {3}}} + -> tensor<*x!tf.resource>> + %iid0 = "tf.Identity"(%id0) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 2}} + -> tensor<*x!tf.resource>> + tf_executor.yield %u0, %iid0, %iid0 : + // expected-remark@above {{ID: 3}} + // expected-remark@above {{Predecessors: {1}}} + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + tf_executor.fetch %island#0, %island#1, %island#2 : + // expected-remark@above {{ID: 5}} + // expected-remark@above {{Predecessors: {4}}} + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + "tf.Yield"(%graph#0, %graph#1, %graph#2) : + // expected-remark@above {{ID: 7}} + (tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + }, + { + %graph:3 = tf_executor.graph { + // expected-remark@above {{ID: 13}} + %island:4 = tf_executor.island { + // expected-remark@above {{ID: 11}} + // expected-remark@above {{Successors: {12}}} + %iid0 = "tf.Identity"(%id0) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 8}} + -> tensor<*x!tf.resource>> + %iid1 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 9}} + -> tensor<*x!tf.resource>> + tf_executor.yield %iid0, %iid0, %iid1 : + // expected-remark@above {{ID: 10}} + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + tf_executor.fetch %island#0, %island#1, %island#2 : + // expected-remark@above {{ID: 12}} + // expected-remark@above {{Predecessors: {11}}} + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + "tf.Yield"(%graph#0, %graph#1, %graph#2) : + // expected-remark@above {{ID: 14}} + (tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + }) { is_stateless = false} + : (tensor) -> + (tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) + %r0 = "tf.ReadVariableOp"(%if#0) : + // expected-remark@above {{ID: 16}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {19,20}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + %r1 = "tf.ReadVariableOp"(%if#1) : + // expected-remark@above {{ID: 17}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {19}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + %r2 = "tf.ReadVariableOp"(%if#2) : + // expected-remark@above {{ID: 18}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {19,20}}} + 
(tensor<*x!tf.resource>>) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %r0) : + // expected-remark@above {{ID: 19}} + // expected-remark@above {{Predecessors: {16,17,18}}} + // expected-remark@above {{Successors: {21}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg1, %r0) : + // expected-remark@above {{ID: 20}} + // expected-remark@above {{Predecessors: {16,18}}} + // expected-remark@above {{Successors: {21}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + tf_executor.yield + // expected-remark@above {{ID: 21}} + // expected-remark@above {{Predecessors: {19,20}}} + } + tf_executor.fetch %island : !tf_executor.control + // expected-remark@above {{ID: 23}} + // expected-remark@above {{Predecessors: {22}}} + } + return + // expected-remark@above {{ID: 25}} + // expected-remark@above {{Predecessors: {24}}} +} + +// ----- + // Tests that the pass tracks control dependencies for variables from a while // op's output. +// Here: +// id0 = arg0 +// while-inputs = (id0/arg0, arg1, arg1) +// while body pass through first and second arg, not last one +// while-results = (arg0, arg1, Unknown) +// #ID 2: read(arg0) -> succ{5} +// #ID 3: read(arg1) -> succ{6} +// #ID 4: read(unknown) -> succ{5,6} +// #ID 5 : write(arg0) +// #ID 6 : write(arg1) + + // CHECK-LABEL: func @output_of_while_op func @output_of_while_op( // expected-remark@above {{ID: 12}} @@ -631,24 +939,24 @@ func @output_of_while_op( // expected-remark@above {{Predecessors: {1}}} // expected-remark@above {{Successors: {5}}} (tensor<*x!tf.resource>>) -> tensor<32xf32> - %r1 = "tf.ReadVariableOp"(%while#1) : + %r1 = "tf.ReadVariableOp"(%while#2) : // expected-remark@above {{ID: 3}} // expected-remark@above {{Predecessors: {1}}} - // expected-remark@above {{Successors: {5}}} - (tensor<*x!tf.resource>>) -> tensor<32xf32> - %r2 = "tf.ReadVariableOp"(%while#2) : - // expected-remark@above {{ID: 4}} - // expected-remark@above {{Predecessors: {1}}} // expected-remark@above {{Successors: {6}}} (tensor<*x!tf.resource>>) -> tensor<32xf32> + %r2 = "tf.ReadVariableOp"(%while#3) : + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Predecessors: {1}}} + // expected-remark@above {{Successors: {5,6}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> "tf.AssignVariableOp"(%arg0, %r0) : // expected-remark@above {{ID: 5}} - // expected-remark@above {{Predecessors: {2,3}}} + // expected-remark@above {{Predecessors: {2,4}}} // expected-remark@above {{Successors: {7}}} (tensor<*x!tf.resource>>, tensor<32xf32>) -> () "tf.AssignVariableOp"(%arg1, %r0) : // expected-remark@above {{ID: 6}} - // expected-remark@above {{Predecessors: {4}}} + // expected-remark@above {{Predecessors: {3,4}}} // expected-remark@above {{Successors: {7}}} (tensor<*x!tf.resource>>, tensor<32xf32>) -> () tf_executor.yield @@ -740,6 +1048,136 @@ func @while_cond( // ----- +// Tests that the pass tracks control dependencies for variables from a +// WhileRegion op's output. 
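What "pass through" means above: a while result keeps the resource id of the body argument it forwards, and anything produced inside the body (such as an unknown side-effecting op) makes the corresponding result unknown. The actual @while_body is unchanged context and not shown in this diff; a body producing that aliasing would look roughly like the illustrative sketch below (not the real test body):

func @passthrough_body_sketch(
    %arg0: tensor<*x!tf.resource<tensor<32xf32>>>,
    %arg1: tensor<*x!tf.resource<tensor<32xf32>>>,
    %arg2: tensor<*x!tf.resource<tensor<32xf32>>>)
    -> (tensor<*x!tf.resource<tensor<32xf32>>>,
        tensor<*x!tf.resource<tensor<32xf32>>>,
        tensor<*x!tf.resource<tensor<32xf32>>>) {
  %u = "tf._UnknownSideEffectingOp_"() : () -> tensor<*x!tf.resource<tensor<32xf32>>>
  // Results #0 and #1 forward %arg0 and %arg1, so they keep those resource ids;
  // result #2 comes from an unknown op, so it may alias anything.
  return %arg0, %arg1, %u : tensor<*x!tf.resource<tensor<32xf32>>>,
                            tensor<*x!tf.resource<tensor<32xf32>>>,
                            tensor<*x!tf.resource<tensor<32xf32>>>
}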
+ +// CHECK-LABEL: func @output_of_whileregion_op +func @output_of_whileregion_op( + // expected-remark@above {{ID: 26}} + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor) { + tf_executor.graph { + // expected-remark@above {{ID: 24}} + // expected-remark@above {{Successors: {25}}} + // CHECK: tf_executor.island + %island = tf_executor.island { + // expected-remark@above {{ID: 22}} + // expected-remark@above {{Successors: {23}}} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 0}} + -> tensor<*x!tf.resource>> + %while:4 = "tf.WhileRegion"(%arg2, %id0, %arg1, %arg1) ( + // expected-remark@above {{ID: 15}} + // expected-remark@above {{Successors: {16,17,18}}} + { + ^bb0(%pred: tensor, + %carg1: tensor<*x!tf.resource>>, + %carg2: tensor<*x!tf.resource>>, + %carg3: tensor<*x!tf.resource>>): + %graph = tf_executor.graph { + // expected-remark@above {{ID: 6}} + %island:2 = tf_executor.island { + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Successors: {5}}} + %const = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + // expected-remark@above {{ID: 1}} + %eq = "tf.Equal"(%pred, %const) : (tensor, tensor) -> tensor + // expected-remark@above {{ID: 2}} + tf_executor.yield %eq : tensor + // expected-remark@above {{ID: 3}} + } + tf_executor.fetch %island#0 : tensor + // expected-remark@above {{ID: 5}} + // expected-remark@above {{Predecessors: {4}}} + } + "tf.Yield"(%graph) : (tensor) -> () + // expected-remark@above {{ID: 7}} + }, + { + ^bb0(%pred: tensor, + %barg0: tensor<*x!tf.resource>>, + %barg1: tensor<*x!tf.resource>>, + %barg2: tensor<*x!tf.resource>>): + %graph:4 = tf_executor.graph { + // expected-remark@above {{ID: 13}} + %island:5 = tf_executor.island { + // expected-remark@above {{ID: 11}} + // expected-remark@above {{Successors: {12}}} + %iid0 = "tf.Identity"(%barg0) : (tensor<*x!tf.resource>>) + // expected-remark@above {{ID: 8}} + -> tensor<*x!tf.resource>> + %u0 = "tf._UnknownSideEffectingOp_"() : () + // expected-remark@above {{ID: 9}} + // expected-remark@above {{Successors: {10}}} + -> tensor<*x!tf.resource>> + tf_executor.yield %pred, %iid0, %barg1, %u0 : + // expected-remark@above {{ID: 10}} + // expected-remark@above {{Predecessors: {9}}} + tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + tf_executor.fetch %island#0, %island#1, %island#2, %island#3 : + // expected-remark@above {{ID: 12}} + // expected-remark@above {{Predecessors: {11}}} + tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + "tf.Yield"(%graph#0, %graph#1, %graph#2, %graph#3) : + // expected-remark@above {{ID: 14}} + (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + } + ) {is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> + (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) + %r0 = "tf.ReadVariableOp"(%while#1) : + // expected-remark@above {{ID: 16}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {19}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + %r1 = "tf.ReadVariableOp"(%while#2) : + // expected-remark@above {{ID: 17}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {20}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + %r2 = "tf.ReadVariableOp"(%while#3) : + // expected-remark@above {{ID: 
18}} + // expected-remark@above {{Predecessors: {15}}} + // expected-remark@above {{Successors: {19,20}}} + (tensor<*x!tf.resource>>) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %r0) : + // expected-remark@above {{ID: 19}} + // expected-remark@above {{Predecessors: {16,18}}} + // expected-remark@above {{Successors: {21}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg1, %r0) : + // expected-remark@above {{ID: 20}} + // expected-remark@above {{Predecessors: {17,18}}} + // expected-remark@above {{Successors: {21}}} + (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + tf_executor.yield + // expected-remark@above {{ID: 21}} + // expected-remark@above {{Predecessors: {19,20}}} + } + tf_executor.fetch %island : !tf_executor.control + // expected-remark@above {{ID: 23}} + // expected-remark@above {{Predecessors: {22}}} + } + return + // expected-remark@above {{ID: 25}} + // expected-remark@above {{Predecessors: {24}}} +} + +// ----- + // Tests that the pass tracks control dependencies based on TF op registry // statefulness flag, for ops not yet defined in ODS. @@ -824,4 +1262,3 @@ func @arguments_with_unique_ids( // expected-remark@above {{ID: 8}} // expected-remark@above {{Predecessors: {7}}} } - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 3d187aa5d60..92cb0458bf9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -256,7 +256,7 @@ func @main(%arg0: tensor) -> () { %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor // CHECK-NOT: tf.EmptyTensorList %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> - %case_op = "tf.Case"(%arg0, %tl) {branches = [@branch_0, @branch_1, @branch_2]} + %case_op = "tf.Case"(%arg0, %tl) {branches = [@branch_0, @branch_1, @branch_2], is_stateless = false} : (tensor, tensor>>) -> tensor>> // CHECK: "tf.Slice" %pop:2 = "tf.TensorListPopBack"(%case_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 20a0e22c48e..1d5e6aad982 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -183,6 +183,20 @@ func @testLeakyWrongAlphaType(tensor<16xf32>) -> tensor<16xf32> { // ----- +// Test tf.Min with complex numbers. +// Previous versions of tensorflow said complex numbers were allowed with +// tf.Min even though it doesn't make sense. The legalization of tf to xla +// requires that complex types are not allowed in tf.Min, so we have an +// explicit unit here to make sure that invariant is enforced. 
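For contrast with the rejected complex case below, the same reduction on a floating point tensor is accepted. A sketch of the positive form, not part of the test file:

func @testMinFloat(%arg0: tensor<4x8xf32>) -> tensor<4x1xf32> {
  %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64>
  // Reducing over dimension 1 with keep_dims = true keeps a size-1 axis in the result.
  %0 = "tf.Min"(%arg0, %dimension) { keep_dims = true } : (tensor<4x8xf32>, tensor<1xi64>) -> tensor<4x1xf32>
  return %0 : tensor<4x1xf32>
}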
+func @testMinComplex(%arg0: tensor<4x8xcomplex>) -> tensor<4x1xcomplex> { + %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> + // expected-error@below {{'tf.Min' op operand #0 must be tensor of}} + %0 = "tf.Min"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8xcomplex>, tensor<1xi64>) -> tensor<4x1xcomplex> + return %0 : tensor<4x1xcomplex> +} + +// ----- + // CHECK-LABEL: func @testMul func @testMul(%arg0: tensor<2xui16>) -> (tensor<2xui16>) { %0 = "tf.Mul"(%arg0, %arg0) {T = "tfdtype$DT_UINT16", device = "/device:CPU:0", name = "Mul"} : (tensor<2xui16>, tensor<2xui16>) -> tensor<2xui16> @@ -775,12 +789,30 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // ----- func @testIfThen(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -func @testIfElse(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{branches should have 1 inputs}} + // expected-error @+1 {{'tf.If' op 'then_branch' inputs (size = 2) should have the same number of values as inputs (size = 1)}} + %1 = "tf.If"(%arg0, %arg1) { + then_branch = @testIfThen, + else_branch = @testIfElse, + is_stateless = false + } : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %1 : tensor<2xf32> +} + +// ----- + +func @testIfThen(tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) +func @testIfElse(tensor<2xf32>) -> tensor<2xf32> + +// Test invalid tf.If operation +func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { +^bb0(%arg0: tensor, %arg1: tensor<2xf32>): + // expected-error @+1 {{'tf.If' op 'then_branch' results (size = 2) should have the same number of values as results (size = 1)}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -798,7 +830,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<*xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{then branch input type tensor<*xf16> is incompatible with operand type tensor<2xf32>}} + // expected-error @+1 {{'tf.If' op 'then_branch' input type tensor<*xf16> is incompatible with input type tensor<2xf32> at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -816,7 +848,7 @@ func @testIfElse(tensor<3xf32>) -> tensor<*xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): - // expected-error @+1 {{branches inputs have incompatible types tensor<2xf32> and tensor<3xf32>}} + // expected-error @+1 {{expects all branch input type(s) (tensor<2xf32>, tensor<3xf32>) at index 0 to be cast compatible}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -834,7 +866,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<3xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): - // expected-error @+1 {{else branch result type tensor<3xf32> is incompatible with op result type tensor<2xf32>}} + // expected-error @+1 {{'tf.If' op 'else_branch' result type tensor<3xf32> is incompatible with result type tensor<2xf32> at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -848,7 +880,7 @@ func 
@testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // Test invalid tf.Yield operation (parent should be IfRegion) func @testInvalidYieldOp(%arg0: f32) -> () { - // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.IfRegion, tf.WhileRegion'}} + // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.CaseRegion, tf.IfRegion, tf.WhileRegion'}} "tf.Yield"(%arg0) : (f32) -> () } @@ -895,7 +927,7 @@ func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2x // Test invalid type for operand #0 for tf.IfRegion operation func @testInvalidIfRegionOpType0(%arg0: f32, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{operand #0 must be tensor of tf.dtype values}} + // expected-error @+1 {{operand #0 must be 0D tensor of 1-bit signless integer values, but got 'f32'}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -982,7 +1014,7 @@ func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> ten // tf.Region yield number of results should match op number of results func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op then should have same number (1) of results as tf.IfRegion but has 2 results}} + // expected-error @+1 {{'tf.IfRegion' op then results (size = 2) should have the same number of values as results (size = 1)}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () @@ -997,7 +1029,7 @@ func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te // ----- func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{tf.IfRegion' op else should have same number (1) of results as tf.IfRegion but has 2 results}} + // expected-error @+1 {{'tf.IfRegion' op else results (size = 2) should have the same number of values as results (size = 1)}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -1013,7 +1045,7 @@ func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te // tf.IfRegion yield types should match op result types func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{then result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + // expected-error @+1 {{'tf.IfRegion' op then result type tensor is incompatible with result type tensor<2xf32> at index 0}} %0 = "tf.IfRegion"(%arg0) ({ "tf.Yield"(%arg0) : (tensor) -> () }, { @@ -1027,7 +1059,7 @@ func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) - // ----- func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{else result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + // expected-error @+1 {{'tf.IfRegion' op else result type tensor is incompatible with result type tensor<2xf32> at index 0}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -1509,7 +1541,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xi32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error 
@+1 {{operand type tensor<*xf32> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op input type tensor<*xf32> is incompatible with result type tensor<*xi32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1527,7 +1559,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{operand type tensor<*xf32> is incompatible with cond function input type}} + // expected-error @+1 {{'tf.While' op input type tensor<*xf32> is incompatible with condition input type tensor<*xi32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1545,7 +1577,7 @@ func @testWhileBody(tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires the number of operands to be equal to the number of body function inputs. Found 1 and 2, respectively}} + // expected-error @+1 {{'tf.While' op inputs (size = 1) should have the same number of values as body inputs (size = 2)}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1563,7 +1595,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xi32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{body function result type tensor<*xi32> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op body result type tensor<*xi32> is incompatible with result type tensor<*xf32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1581,7 +1613,7 @@ func @testWhileBody(tensor<4xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{cond function input type tensor<3xf32> is incompatible with body function input type}} + // expected-error @+1 {{'tf.While' op condition input type tensor<3xf32> is incompatible with body input type tensor<4xf32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1600,7 +1632,7 @@ func @testWhileBody(tensor<*x!tf.resource>>) -> (tensor>>) -> (tensor>>) { ^bb0(%arg0: tensor<*x!tf.resource>>): - // expected-error @+1 {{operand type tensor<*x!tf.resource>> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op input type tensor<*x!tf.resource>> is incompatible with result type tensor>> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1696,48 +1728,71 @@ func @testValidWhileRegionNoInputs() -> () { } // ----- +// Invalid while tests. There are 5 sets of type matching that is required +// I = input, O = output, BI, BO = body input/output, CI = cond input. +// [I, O], [I, CI], [I, BI], [BO, BI], [BO, O]. +// Each check can fail due to number or type mismatch. However, these +// conditions are not all independent. So we just check I->{CI, BI}, O->BO, and +// in addition I->O. BO->BI mismatch cannot be independently created without +// breaking one of these mismatches. That gives us 4x2 tests. In addition +// condition result needs to be tensor, for which we have 3 +// additional validation tests. 
All these tests are based on the following +// valid while -func @testInvalidWhileRegionMismatchCondInputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op condition should have same number of inputs (1) as tf.WhileRegion but has 0 inputs}} - %0 = "tf.WhileRegion"(%arg) ( - { - // ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - "tf.Yield"(%arg) : (tensor) -> () - } - ) : (tensor) -> (tensor) +func @testInvalidTestValidBase(%arg0 : tensor) -> (tensor) { + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} +func @testInvalidWhileRegion_I_CI_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as condition inputs (size = 0)}} + %0 = "tf.WhileRegion"(%arg0) ( + { + //^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) return %0 : tensor } // ----- -func @testInvalidWhileRegionMismatchCondInputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op condition input type tensor is incompatible with tf.WhileRegion input type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - "tf.Yield"(%barg) : (tensor) -> () - } - ) : (tensor) -> (tensor) - +func @testInvalidWhileRegion_I_CI_TypeMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op input type tensor is incompatible with condition input type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) return %0 : tensor } // ----- -func @testInvalidWhileRegionMismatchBodyInputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op body should have same number of inputs (1) as tf.WhileRegion but has 2 inputs}} - %0 = "tf.WhileRegion"(%arg) ( +func @testInvalidWhileRegion_I_BI_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as body inputs (size = 2)}} + %0 = "tf.WhileRegion"(%arg0) ( { ^bb0(%carg: tensor): %true = constant dense<1> : tensor @@ -1754,9 +1809,9 @@ func @testInvalidWhileRegionMismatchBodyInputCount(%arg : tensor) -> (tenso // ----- -func @testInvalidWhileRegionMismatchBodyInputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{body input type tensor is incompatible with tf.WhileRegion input type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( +func @testInvalidWhileRegion_I_BI_TypeMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op input type tensor is incompatible with body input type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( { ^bb0(%carg: tensor): %true = constant dense<1> : tensor @@ -1774,6 +1829,77 @@ func @testInvalidWhileRegionMismatchBodyInputType(%arg : tensor) -> 
(tensor // ----- +func @testInvalidWhileRegion_O_BO_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op body results (size = 2) should have the same number of values as results (size = 1)}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg, %barg) : (tensor, tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0#0 : tensor +} + +// ----- + +func @testInvalidWhileRegionMismatch_O_BO_TypeMismatch(%arg0 : tensor, %arg1: tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op body result type tensor is incompatible with result type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%arg1) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} + +// ----- + +func @testInvalidWhileRegion_I_O_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error@+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as results (size = 2)}} + %0:2 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg, %barg) : (tensor, tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor, tensor) + return %0#0 : tensor +} + +// ----- + +func @testInvalidWhileRegion_I_O_TypeMismatch(%arg0: tensor, %arg1 : tensor) -> (tensor) { + // expected-error@+1 {{'tf.WhileRegion' op input type tensor is incompatible with result type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%arg1) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} +// ----- + func @testInvalidWhileRegionConditionOutputCount2(%arg : tensor) -> (tensor) { // expected-error @+1 {{'tf.WhileRegion' op condition should have a single tensor result}} %0 = "tf.WhileRegion"(%arg) ( @@ -1827,45 +1953,6 @@ func @testInvalidWhileRegionConditionOutputType(%arg : tensor) -> (tensor } -// ----- - -func @testInvalidWhileRegionMismatchBodyOutputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op body should have same number (1) of results as tf.WhileRegion but has 2 results}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - %false = constant dense<1> : tensor - "tf.Yield"(%barg, %false) : (tensor, tensor) -> () - } - ) : (tensor) -> (tensor) - - return %0 : tensor -} - -// ----- - -func @testInvalidWhileRegionMismatchBodyOutputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{body result type tensor is incompatible with tf.WhileRegion result type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - %c = "tf.Cast"(%barg) : (tensor) -> tensor - "tf.Yield"(%c) : (tensor) -> () - } - ) : (tensor) -> (tensor) - - return %0 : tensor -} // ----- @@ -2033,6 +2120,15 @@ func @testConst() -> tensor { // ----- +// Test invalid tf.ToBool +func 
@testInvalidToBool(%arg0: tensor) -> tensor<1xi1> { + // expected-error @+1 {{op result #0 must be 0D tensor of 1-bit signless integer values, but got 'tensor<1xi1>'}} + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor<1xi1> + return %0 : tensor<1xi1> +} + +// ----- + // Test valid tf.Transpose // CHECK-LABEL: testTranspose func @testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { @@ -2354,6 +2450,25 @@ func @testSlice_unknown_begin_in_bounds(%arg0: tensor<4xi32>, %begins: tensor<1x // ----- +func @testSlice_unequal_output_input_rank(%arg0: tensor<4xi32>, %begins: tensor<1xi64>) -> tensor { + %sizes = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) + // expected-error @+1 {{requires output to have the same rank as input, but got input rank 1 and output rank 0}} + %0 = "tf.Slice"(%arg0, %begins, %sizes) : (tensor<4xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSlice_wrong_output_size(%arg0: tensor<4xi32>) -> tensor<1xi32> { + %begins = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) + %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi64>} : () -> (tensor<1xi64>) + // expected-error @+1 {{requires output size to have the same size of slice, got slice size 2 and output size 1}} + %0 = "tf.Slice"(%arg0, %begins, %sizes) : (tensor<4xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi32> + return %0 : tensor<1xi32> +} + +// ----- + // Valid StridedSlice operation. func @testStridedSlice(%input: tensor<4x8xf32>, %begin: tensor<2xi64>, %end: tensor<2xi64>, %strides: tensor<2xi64>) -> tensor { %0 = "tf.StridedSlice"(%input, %begin, %end, %strides) : (tensor<4x8xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor @@ -3138,6 +3253,125 @@ func @testBatchMatMulV2(%lhs: tensor<10x10xf32>, %rhs: tensor) { // ----- +// CHECK-LABEL: func @testBatchMatMulV2NoBatchDimension +func @testBatchMatMulV2NoBatchDimension(%lhs: tensor<5x10xf32>, %rhs: tensor<10x10xf32>) -> (tensor<5x10xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<5x10xf32>, tensor<10x10xf32>) -> tensor<5x10xf32> + return %0 : tensor<5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2ValidBroadcastingBatchDimension +func @testBatchMatMulV2ValidBroadcastingBatchDimension(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) -> (tensor<10x2x5x10xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x5x10xf32> + return %0 : tensor<10x2x5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2ValidMultiBatchDimension +func @testBatchMatMulV2ValidMultiBatchDimension(%lhs: tensor<4x5x1x3x2xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x2x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x1x3x2xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x2x5xf32> + return %0 : tensor<4x5x1x2x5xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidBroadcastingBatchDimensionWithHigherXRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<10x2x5x10xf32>' and rhs shape 'tensor<10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidBroadcastingBatchDimensionWithSameRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<10x2x5x10xf32>' 
and rhs shape 'tensor<10x10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidBroadcastingBatchDimensionWithHigherYRank(%lhs: tensor<2x5x10xf32>, %rhs: tensor<10x10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<2x5x10xf32>' and rhs shape 'tensor<10x10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<2x5x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputBatchDimension(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<2x10x10xf32>) { + // expected-error @+1 {{has mismatching input batch dimension 2 and output batch dimension 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<2x10x10xf32>) -> tensor<10x3x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x1x10x10xf32>) { + // expected-error @+1 {{found invalid output rank, expected 4 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x1x10x10xf32>) -> tensor<10x5x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputRowDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on row, expected 5 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2AdjXInvalidOutputRowDim(%lhs: tensor<10x2x10x5xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on row, expected 5 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<10x2x10x5xf32>, tensor<10x10xf32>) -> tensor<10x2x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputColDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 10 but got 5}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x5x5xf32> +} + +// ----- + +func @testBatchMatMulV2AdjYInvalidOutputColDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<4x10xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 4 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_y = true } : (tensor<10x2x5x10xf32>, tensor<4x10xf32>) -> tensor<10x2x5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2PartiallyKnownInputBatchDim +func @testBatchMatMulV2PartiallyKnownInputBatchDim(%lhs: tensor<4x5x?x3x2xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x?x2x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x?x3x2xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x?x2x5xf32> + return %0 : tensor<4x5x?x2x5xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2PartiallyKnownMatmulDim +func @testBatchMatMulV2PartiallyKnownMatmulDim(%lhs: tensor<4x5x1x?x3xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<4x5x1x?x3xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x5xf32> + return %0 : tensor<4x5x1x?x5xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidPartiallyKnownMatmulDim(%lhs: tensor<4x5x1x?x3xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x3xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 5 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<4x5x1x?x3xf32>, 
tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x3xf32> + return %0 : tensor<4x5x1x?x3xf32> +} + +// ----- + +func @testBatchMatMulV2AdjXInvalidPartiallyKnownMatmulDim(%lhs: tensor<4x5x1x3x?xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x3xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 5 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x1x3x?xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x3xf32> + return %0 : tensor<4x5x1x?x3xf32> +} + +// ----- + func @testDataFormatVecPermuteInvalid1dInput(%x: tensor<5xi32>) { // expected-error @+1 {{requires 1D input of size 4}} %0 = "tf.DataFormatVecPermute"(%x): (tensor<5xi32>) -> tensor<5xi32> @@ -3313,3 +3547,220 @@ func @testBatchToSpaceInvalidOutputDepth(%arg0: tensor<16x8x8x3xf32>, %arg1: ten %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<*xi32>) -> tensor<4x8x8x8xf32> return } + +// ----- + +func @branch() + +func @testCaseBadBranchIndicesShape(%arg0: tensor<8xi32>) { + // expected-error @+1 {{expects 'branch_index' to be a scalar, but got 'tensor<8xi32>'}} + "tf.Case"(%arg0) {branches = [@branch], is_stateless = false} : (tensor<8xi32>) -> () + return +} + +// ----- + +func @branch0(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +func @branch1(tensor<2xf32>) -> tensor<2xf32> + +func @testCaseMismatchedNumOperands(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{'tf.Case' op branch #0 inputs (size = 2) should have the same number of values as inputs (size = 1)}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) +func @branch1(tensor<2xf32>) -> tensor<2xf32> + +func @testCaseMismatchedNumResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{'tf.Case' op branch #0 results (size = 2) should have the same number of values as results (size = 1)}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<*xf16>) -> tensor<*xf32> +func @branch1(tensor<*xf32>) -> tensor<*xf32> + +func @testCaseOperandNotCastCompatible(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{'tf.Case' op branch #0 input type tensor<*xf16> is incompatible with input type tensor<2xf32> at index 0}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<2xf32>) -> tensor<*xf32> +func @branch1(tensor<3xf32>) -> tensor<*xf32> + +func @testCaseBranchArgumentsNotCastCompatible(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects all branch input type(s) (tensor<2xf32>, tensor<3xf32>) at index 0 to be cast compatible}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<*xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<*xf32>) -> tensor<*xf32> +func @branch1(tensor<*xf32>) -> tensor<3xf32> + +func @testCaseResultNotCastCompatible(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<2xf32> { + // expected-error @+1 {{'tf.Case' op branch #1 result type tensor<3xf32> is incompatible with result type 
tensor<2xf32> at index 0}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<*xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @testCaseRegionNoRegions(%arg0: tensor) { + // expected-error @+1 {{expects to have at least 1 region}} + "tf.CaseRegion"(%arg0) {is_stateless = false} : (tensor) -> () + return +} + +// ----- + +func @testCaseRegionBadBranchIndicesShape(%arg0: tensor<8xi32>) { + // expected-error @+1 {{expects 'branch_index' to be a scalar, but got 'tensor<8xi32>'}} + "tf.CaseRegion"(%arg0) ( { + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor<8xi32>) -> () + return +} + +// ----- + +func @testCaseRegionMismatchedNumResults(%arg0: tensor) { + // expected-error @+1 {{'tf.CaseRegion' op branch #0 results (size = 0) should have the same number of values as results (size = 1)}} + %1 = "tf.CaseRegion"(%arg0) ( { + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor) -> tensor + return +} + +// ----- + +func @testCaseRegionMismatchedResultTypes(%arg0: tensor, %arg1: tensor) { + // expected-error @+1 {{'tf.CaseRegion' op branch #0 result type tensor is incompatible with result type tensor at index 0}} + %1 = "tf.CaseRegion"(%arg0) ( { + "tf.Yield"(%arg1) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + return +} + +// ----- + +// Test valid tf.Cumsum +func @testCumsum(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor<8x16xf32> { + %0 = "tf.Cumsum"(%arg, %axis) : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} + +// ----- + +func @testCumprod(%arg: tensor<8x16xf32>, %axis: tensor<2xi32>) -> tensor<8x16xf32> { + // expected-error @+1 {{requires scalar axis operand}} + %0 = "tf.Cumprod"(%arg, %axis) : (tensor<8x16xf32>, tensor<2xi32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} + +// ----- + +func @testCumprod(%arg: tensor<8x16xf32>) -> tensor<8x16xf32> { + %axis = constant dense<-3> : tensor + // expected-error @+1 {{axis operand should be within range [-2, 2)}} + %0 = "tf.Cumprod"(%arg, %axis) : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} + +// ----- + +func @testTile(%arg0: tensor<2x3x?xf32>) { + %cst = constant dense <[2, 3, 4]> : tensor<3xi32> + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3x?xf32>, tensor<3xi32>) -> tensor<4x9x?xf32> + return +} + +// ----- + +func @testTileMultipleNotRank1(%arg0: tensor<2x3xf32>, %arg1: tensor<1x1xi32>) { + // expected-error @+1 {{expected multiples to be rank 1, got rank = 2}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<1x1xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInputRankNotEqualToMultiplesSize(%arg0: tensor<2x3xf32>, %arg1: tensor<3xi32>) { + // expected-error @+1 {{expected size of multiples equal to rank of input, got multiples of size 3, and input of rank 2}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<3xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInputRankNotEqualToOutputRank(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) { + // expected-error @+1 {{expected rank of input to equal to rank of output, got input of rank 2, and output of rank 3}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3x1xf32> + return +} + +// ----- + +func @testTileNegativeMultiples(%arg0: tensor<2x3xf32>) { + %cst = constant dense <[-1, 1]> : tensor<2xi32> + // expected-error @+1 {{expected multiples to be non-negative, got multiples[0] = -1}} + %0 = "tf.Tile"(%arg0, 
%cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInvalidOutputShape(%arg0: tensor<2x3xf32>) { + %cst = constant dense <[2, 3]> : tensor<2xi32> + // expected-error @+1 {{requires input.shape[1] (3) * 3 to be equal to output.shape[1] (6)}} + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<4x6xf32> + return +} + +// ----- + +// Test reference variable support for some ops (no errors expected) + +// CHECK-LABEL: @testMaximumWithRef +func @testMaximumWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.Maximum + %0 = "tf.Maximum"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testAddV2WithRef +func @testAddV2WithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.AddV2 + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testRealDivWithRef +func @testRealDivWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.RealDivOp + %0 = "tf.RealDivOp"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testDivNoNanWithRef +func @testDivNoNanWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.DivNoNanOp + %0 = "tf.DivNoNanOp"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testAddWithRef +func @testAddWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.Add + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir index 7fc2b210f91..11ceac1fe99 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir @@ -9,17 +9,17 @@ func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor tensor %1 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor - %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>]} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], is_stateless = false} : (tensor, tensor, tensor) -> tensor return %0, %4 : tensor, tensor } -func @add(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @add(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } -func @sub(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @sub(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Sub"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir index 1e537880620..23a8e904ad9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir @@ -433,7 +433,7 @@ func @nextiteration(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> tf_executor.NextIteration.Sink[%1#1] %1#0 : tensor<*xf32> // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}} : 
tensor<*xf32> +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}} : tensor<*xf32> tf_executor.fetch %1#0 : tensor<*xf32> } return %0 : tensor<*xf32> @@ -445,7 +445,7 @@ func @nextiteration_with_attributes(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<* %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> {attr3 = 32 : i64, tf_executor.attr_fetch = "some_value"} tf_executor.NextIteration.Sink[%1#1] %1#0 : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> {attr3 = 32 : i64, tf_executor.attr_fetch = "some_value"} -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}} : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}} : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} tf_executor.fetch %1#0 : tensor<*xf32> } return %0 : tensor<*xf32> @@ -457,9 +457,9 @@ func @nextiteration_control(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<* %1:3 = tf_executor.Switch %arg0, %arg1 : tensor<*xf32> %2:2 = tf_executor.Enter %arg0, %1#2, %1#2 frame "some/frame" : tensor<*xf32> %3:3 = tf_executor.NextIteration.Source : tensor<*xf32> - tf_executor.NextIteration.Sink [%3#1] %3#0, %1#2 : tensor<*xf32> + tf_executor.NextIteration.Sink[%3#1] %3#0, %1#2 : tensor<*xf32> // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}}, %{{.*}} : tensor<*xf32> +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}}, %{{.*}} : tensor<*xf32> tf_executor.fetch %3#0 : tensor<*xf32> } return %0 : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir new file mode 100644 index 00000000000..6399d7d6fb0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir @@ -0,0 +1,24 @@ +// RUN: tf-opt %s -tf-tpu-cleanup-cluster-attributes | FileCheck %s + +func @test(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: "tf_device.cluster" + // CHECK-NOT: _tpu_replicate = + // CHECK-NOT: device = + %1 = "tf_device.cluster"() ( { + %2 = "tf.Add"(%arg1, %arg1) : (tensor, tensor) -> tensor + %3 = "tf.IfRegion"(%arg0) ({ + %4 = "tf.Mul" (%arg1, %2) {device = "y"}: (tensor, tensor) -> tensor + "tf.Yield"(%4) : (tensor) -> () + }, { + %5 = "tf.Div" (%arg1, %2) : (tensor, tensor) -> tensor + "tf.Yield"(%5) : (tensor) -> () + }) {is_stateless = true, _tpu_replicate = "x" } : (tensor) -> (tensor) + tf_device.return %3 : tensor + // CHECK: {_tpu_replicate = "x", cluster_attr = "cluster_attr", device = "y"} + }) {cluster_attr = "cluster_attr", _tpu_replicate = "x", device = "y"} : () -> tensor + // CHECK: "tf.Add" + // CHECK-SAME: {_tpu_replicate = "x", device = "y"} + %2 = "tf.Add"(%arg2, %1) {_tpu_replicate = "x", device = "y"} : (tensor, tensor) -> tensor + // CHECK: return + return %2 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir index 9467f890419..7b670cd831c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir @@ -11,9 +11,9 @@ func @non_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"} NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two 
return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" @@ -31,7 +31,7 @@ func @non_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"} // CHECK-NEXT: "tf.TPUExecute"(%[[COPY0]], %[[COPY1]], %[[COMPILE]]#1) %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -49,9 +49,9 @@ func @multiple_compile_uses(%arg0: tensor<*x!tf.resource> {tf.device = "/device: NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} @@ -62,13 +62,13 @@ func @multiple_compile_uses(%arg0: tensor<*x!tf.resource> {tf.device = "/device: }) {device = "/device:CPU:0"} : () -> () %execute0 = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor %4:2 = "tf._UnKnownOp_"() : () -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) %execute1 = "tf_device.launch"() ( { %5 = "tf.TPUExecute"(%4#0, %4#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %5 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute1 : tensor @@ -85,9 +85,9 @@ func @on_tpu_iter(%arg0: tensor<*x!tf.resource> {tf.device = "/device:TPU:0"}) - NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:TPU:0"} @@ -98,7 +98,7 @@ func @on_tpu_iter(%arg0: tensor<*x!tf.resource> {tf.device = "/device:TPU:0"}) - }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -116,9 +116,9 @@ func @arg_on_tpu_iter_on_cpu(%arg0: tensor<*x!tf.resource> {tf.device = "/device NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} @@ -129,7 +129,7 @@ func @arg_on_tpu_iter_on_cpu(%arg0: tensor<*x!tf.resource> {tf.device = "/device }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -148,9 +148,9 @@ func @arg_on_tpu_intermediate_ops_on_cpu(%arg0: tensor<*x!tf.resource> {tf.devic NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) %id1 = "tf.Identity"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<*x!tf.resource>) %id2 = "tf.Identity"(%id1) {device = "/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<*x!tf.resource>) // CHECK-NOT: "tf.TPUGetLayoutOp" @@ -163,7 +163,7 @@ func @arg_on_tpu_intermediate_ops_on_cpu(%arg0: tensor<*x!tf.resource> {tf.devic }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -181,9 +181,9 @@ func @var_handle_on_tpu_iter_on_cpu() -> tensor { NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) %var = "tf.VarHandleOp"() {container = "c", shared_name = "v", device = "/device:TPU:0"} : () -> tensor<*x!tf.resource> // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" @@ -195,7 +195,7 @@ func @var_handle_on_tpu_iter_on_cpu() -> tensor { }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -212,9 +212,9 @@ func @unsupported_ops(%arg0: tensor<3x3x1x32xf32> {tf.device = "/device:CPU:0"}) NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2 = "tf._Unknown_"() : () -> tensor<3x3x1x32xf32> @@ -224,7 +224,7 @@ func @unsupported_ops(%arg0: tensor<3x3x1x32xf32> {tf.device = "/device:CPU:0"}) }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%arg0, %2, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -246,9 +246,9 @@ func @replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"}) -> NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} // CHECK: %[[ITER1:.*]]:2 = "tf.IteratorGetNext" @@ -267,7 +267,7 @@ func @replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"}) -> {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}} { // CHECK: "tf.TPUExecute"(%[[R0]], %[[R1]], %[[COMPILE]]#1) %execute = "tf_device.launch"() ( { - %4 = "tf.TPUExecute"(%r0, %r1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + %4 = "tf.TPUExecute"(%r0, %r1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %4 : tensor }) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor tf_device.return %execute : tensor @@ -286,9 +286,9 @@ func @inside_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU: NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" "tf_device.launch"() ( { @@ -300,7 +300,7 @@ func @inside_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU: %2:2 = "tf.IteratorGetNext"(%r0) : (tensor<*x!tf.resource>) -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) %execute = "tf_device.launch"() ( { - %4 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + %4 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %4 : tensor }) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor tf_device.return %execute : tensor @@ -330,9 +330,9 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK: %[[COMPILE:.*]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() %compile:3 = "tf_device.launch"() ( { - %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\01 \02", mlir_module = "..."} : () -> (tensor, tensor, tensor) - tf_device.return %1#0, %1#1, %1#2 : tensor, tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor, tensor) + %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\01 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf.string>, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" @@ -351,7 +351,7 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "/device:TPU:0" "tf_device.launch"() ( { - "tf.TPUExecute"(%2#0, %compile#1) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%2#0, %compile#1) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "/device:TPU:0"} : () -> () tf_device.return @@ -364,7 +364,7 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "/device:TPU:1" "tf_device.launch"() ( { - "tf.TPUExecute"(%2#1, %compile#2) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%2#1, %compile#2) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "/device:TPU:1"} : () -> () tf_device.return @@ -396,9 +396,9 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK: %[[COMPILE:.*]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() %compile:3 = "tf_device.launch"() ( { - %1:3 = 
"tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\02 \02", mlir_module = "..."} : () -> (tensor, tensor, tensor) - tf_device.return %1#0, %1#1, %1#2 : tensor, tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor, tensor) + %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\02 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf.string>, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[ITER0:.*]]:2 = "tf.IteratorGetNext"(%[[ARG0]]) @@ -423,7 +423,7 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" "tf_device.launch"() ( { - "tf.TPUExecute"(%r0, %compile#1) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%r0, %compile#1) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -433,7 +433,7 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "TPU_REPLICATED_CORE_1" "tf_device.launch"() ( { - "tf.TPUExecute"(%r1, %compile#2) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%r1, %compile#2) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_1"} : () -> () tf_device.return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir new file mode 100644 index 00000000000..a505a4e3269 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir @@ -0,0 +1,64 @@ +// RUN: tf-opt -tf-tpu-resource-read-for-write %s | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @write_only_resource +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor<*x!tf.resource>>) +func @write_only_resource(%arg0: tensor, %arg1: tensor, %arg2: tensor<*x!tf.resource>>) { + // CHECK-NEXT: [[READ:%.*]] = "tf.ReadVariableOp"([[ARG2]]) + // CHECK-NEXT: [[CLUSTER:%.*]]:2 = "tf_device.cluster_func"([[ARG0]], [[ARG1]], [[READ]]) + // CHECK-SAME: _tpu_replicate = "write" + %0:2 = "tf_device.cluster_func"(%arg0, %arg1) {_tpu_replicate = "write", func = @write_func} : (tensor, tensor) -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG2]], [[CLUSTER]]#1) + "tf.AssignVariableOp"(%arg2, %0#1) : (tensor<*x!tf.resource>>, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @write_func +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor, {{%.*}}: tensor) -> (tensor, tensor) +func @write_func(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg1, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @read_write_resource +func @read_write_resource(%arg0: tensor, %arg1: tensor, %arg2: tensor<*x!tf.resource>>) { + // CHECK-COUNT-1: tf.ReadVariableOp + %0 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf.resource>>) -> tensor + %1:2 = "tf_device.cluster_func"(%arg0, %arg1, %0) {_tpu_replicate = "read_write", 
func = @read_write_func} : (tensor, tensor, tensor) -> (tensor, tensor) + "tf.AssignVariableOp"(%arg2, %1#1) : (tensor<*x!tf.resource>>, tensor) -> () + return +} + +// CHECK-LABEL: func @read_write_func +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor) -> (tensor, tensor) +func @read_write_func(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg1, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @multiple_write_resource +func @multiple_write_resource(%arg0: tensor, %arg1: tensor<*x!tf.resource>>) { + // CHECK-NOT: tf.ReadVariableOp + %0:2 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "multiple_write", func = @multiple_write_func} : (tensor) -> (tensor, tensor) + "tf.AssignVariableOp"(%arg1, %0#0) : (tensor<*x!tf.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg1, %0#1) : (tensor<*x!tf.resource>>, tensor) -> () + return +} + +// CHECK-LABEL: func @multiple_write_func +// CHECK-SAME: ({{%.*}}: tensor) -> (tensor, tensor) +func @multiple_write_func(%arg0: tensor) -> (tensor, tensor) { + return %arg0, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @multiple_result_user +func @multiple_result_user(%arg0: tensor, %arg1: tensor<*x!tf.resource>>) -> tensor { + // CHECK-NOT: tf.ReadVariableOp + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "multiple_uses", func = @multiple_result_user_func} : (tensor) -> tensor + "tf.AssignVariableOp"(%arg1, %0) : (tensor<*x!tf.resource>>, tensor) -> () + return %0 : tensor +} + +// CHECK-LABEL: func @multiple_result_user_func +// CHECK-SAME: ({{%.*}}: tensor) -> tensor +func @multiple_result_user_func(%arg0: tensor) -> tensor { + return %arg0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index 1e308b42bfc..277e4a8415e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -61,9 +61,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -86,7 +86,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor @@ -153,9 +153,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -173,7 +173,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %arg32, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor) -> () + tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -239,9 +239,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -254,7 +254,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -342,9 +342,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -367,7 +367,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index 37dfec5e6df..281e4baaa12 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -331,6 +331,155 @@ func @mirrored_variables(%arg0: tensor>>, %arg1: ten // CHECK-SAME: _replicated_input_indices = [0, 1, 2] +// Test resource usage after resource use in cluster is moved to after the +// cluster. 
+// CHECK-LABEL: func @resource_after_cluster
+// CHECK-SAME: ([[USED_RESOURCE:%.*]]: tensor<*x!tf.resource>>, [[UNUSED_RESOURCE:%.*]]: tensor<*x!tf.resource>>)
+func @resource_after_cluster(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>) {
+  // CHECK-NEXT: [[CONST:%.*]] = "tf.Const"
+  %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor
+
+  // CHECK-NEXT: "tf.AssignSubVariableOp"([[UNUSED_RESOURCE]], [[CONST]])
+
+  // CHECK: "tf_device.cluster"
+  // CHECK-NEXT: "tf.ReadVariableOp"([[USED_RESOURCE]])
+  // CHECK-NEXT: "tf.NoOp"
+  // CHECK-NEXT: tf_device.return
+  "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster_test_fn", allow_soft_placement = false, computation_shape = [], device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> ()
+  %1 = "tf.ReadVariableOp"(%arg0) {_tpu_replicate = "cluster_test_fn"} : (tensor<*x!tf.resource>>) -> tensor
+
+  "tf.AssignSubVariableOp"(%arg1, %0) : (tensor<*x!tf.resource>>, tensor) -> ()
+
+  // CHECK: "tf.AssignAddVariableOp"([[USED_RESOURCE]], [[CONST]])
+  "tf.AssignAddVariableOp"(%arg0, %0) : (tensor<*x!tf.resource>>, tensor) -> ()
+
+  "tf.NoOp"() {_tpu_replicate = "cluster_test_fn"} : () -> ()
+  return
+}
+
+
+// Test resource not used by cluster is moved to before the cluster.
+// CHECK-LABEL: func @resource_before_cluster
+func @resource_before_cluster() {
+  // CHECK-NEXT: [[CONST:%.*]] = "tf.Const"
+  %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor
+
+  // CHECK-NEXT: [[UNUSED_RESOURCE:%.*]] = "tf.VarHandleOp"
+  // CHECK-NEXT: "tf.AssignAddVariableOp"([[UNUSED_RESOURCE]], [[CONST]])
+
+  // CHECK: "tf_device.cluster"
+  // CHECK-NEXT: "tf.NoOp"
+  // CHECK-NEXT: tf_device.return
+  "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster_test_fn", allow_soft_placement = false, computation_shape = [], device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> ()
+
+  %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> tensor<*x!tf.resource>>
+  "tf.AssignAddVariableOp"(%1, %0) : (tensor<*x!tf.resource>>, tensor) -> ()
+
+  "tf.NoOp"() {_tpu_replicate = "cluster_test_fn"} : () -> ()
+  return
+}
+
+
+// Test cluster formation with ops with attached regions within a cluster.
+// Nested ops that are moved should get their _tpu_replicate and device
+// attributes cleared.
+// CHECK-LABEL: func @cluster_ops_with_regions
+func @cluster_ops_with_regions() {
+  %0 = "tf.opA"() ({
+    %1 = "tf.opB"() {_tpu_replicate = "replicate", device = "device", name = "nameB"} : () -> (tensor)
+  }) {_tpu_replicate = "replicate", device = "device", name = "nameA"} : () -> tensor
+  "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", num_replicas = 1, topology = "topology"} : () -> ()
+  return
+}
+
+// CHECK: "tf.opA"() ( {
+// CHECK-NEXT: "tf.opB"
+// CHECK-NOT: _tpu_replicate = "replicate"
+// CHECK-NOT: device = "device"
+// CHECK-SAME: name = "nameB"
+// CHECK: })
+// CHECK-NOT: _tpu_replicate = "replicate"
+// CHECK-NOT: device = "device"
+// CHECK: name = "nameA"
+// CHECK: tf_device.return
+
+// A nested cluster op using result of another cluster op.
In the below, opA and +// opB go in a cluster, and opD stays outside. +// CHECK-LABEL: func @cluster_nested_op_using_other_op +func @cluster_nested_op_using_other_op() { + %0 = "tf.opA"() { _tpu_replicate = "foo" } : () -> tensor + "tf.opB"() ({ + "tf.opC"(%0) : (tensor) -> () + }) { _tpu_replicate = "foo" } : () -> () + "tf.opD"(%0) : (tensor) -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CLUSTER:%.*]] = "tf_device.cluster"() ( { +// CHECK: [[OPA:%.*]] = "tf.opA"() : () -> tensor +// CHECK: "tf.opB"() ( { +// CHECK: "tf.opC"([[OPA]]) +// CHECK: tf_device.return [[OPA]] +// CHECK: "tf.opD"([[CLUSTER]]) + +// Preceding user is using resource updated by a nested op. +!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @cluster_nested_op_updating_resource +func @cluster_nested_op_updating_resource() { + %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> !tf_res + + "tf.opA"() ({ + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.terminator"() : () -> () + }) { _tpu_replicate = "foo" } : () -> () + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.opB"() { _tpu_replicate = "foo" } : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CONST:%.*]] = "tf.Const" +// CHECK: [[VAR:%.*]] = "tf.VarHandleOp" +// CHECK: "tf_device.cluster"() ( { +// CHECK: "tf.opA"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) +// CHECK: }) +// CHECK: "tf.opB"() +// CHECK: tf_device.return +// CHECK: }) +// CHECK-SAME: _tpu_replicate = "foo" +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) + +// Preceding user is using resource updated by the cluster within a nested op. +// Resource is updated by a cluster op, and opA (not in cluster) is using the +// resource in a nested op. We expect opA to be after the cluster. 
+// CHECK-LABEL: func @cluster_nested_op_using_resource +func @cluster_nested_op_using_resource() { + %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> !tf_res + "tf.AssignAddVariableOp"(%1, %0) { _tpu_replicate = "foo" } : (!tf_res, tensor) -> () + "tf.opA"() ({ + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.terminator"() : () -> () + }) : () -> () + "tf.opB"() { _tpu_replicate = "foo" } : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CONST:%.*]] = "tf.Const" +// CHECK: [[VAR:%.*]] = "tf.VarHandleOp" +// CHECK: "tf_device.cluster"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) +// CHECK: "tf.opB"() +// CHECK: tf_device.return +// CHECK: }) +// CHECK-SAME: _tpu_replicate = "foo" +// CHECK: "tf.opA"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) + // ----- @@ -358,18 +507,6 @@ func @bad_num_replicas() { // ----- -// Test that functions without TPUReplicateMetadata op are skipped without -// error -// CHECK-LABEL: func @missing_metadata_op -func @missing_metadata_op() { - // expected-warning@+1 {{TPUReplicateMetadata for associated '_tpu_replicate' attribute 'replicate' is missing}} - %0 = "tf.opA"() {_tpu_replicate = "replicate"} : () -> tensor - return -} - -// ----- - - // Test cluster with TPUReplicatedInput where the number of operands does not // match associated `num_replicas` attribute. func @mismatched_replicated_input(%arg0: tensor) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir new file mode 100644 index 00000000000..88af4535d81 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir @@ -0,0 +1,118 @@ +// RUN: tf-opt %s -tf-tpu-colocate-composite-resource-ops | FileCheck %s + +// Tests ReadVariable op using composite device resource is wrapped inside +// tf_device.Cluster. 
+ +// CHECK-LABEL: func @testReadVariableOpColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testReadVariableOpColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[RESOURCE_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[RI_0]]) + // CHECK-NEXT: tf_device.return %[[READ_OUT]] + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %1 = "tf.A"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %1) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + "tf_device.launch"() ( { + // CHECK: "tf.B"(%[[RESOURCE_OUT]]) + "tf.B"(%0) : (tensor<4xf32>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + } + return +} + +// CHECK-LABEL: func @testReadVariableOpAfterIdentityColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testReadVariableOpAfterIdentityColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[IDENTITY_OUT:.*]] = "tf.Identity"(%[[RI_0]]) + // CHECK: %[[RESOURCE_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[IDENTITY_OUT]]) + // CHECK-NEXT: tf_device.return %[[READ_OUT]] + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %2 = "tf.A"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + "tf_device.launch"() ( { + // CHECK: "tf.B"(%[[RESOURCE_OUT]]) + "tf.B"(%1) : (tensor<4xf32>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + } + return +} + +// Tests AssignVariable op using composite device resource is wrapped inside +// tf_device.Cluster. 
+ +// CHECK-LABEL: func @testAssignVariableOpColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testAssignVariableOpColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32> + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %1 = "tf.A"() : () -> (tensor<4xf32>) + "tf.AssignVariableOp"(%arg1, %1) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + %2 = "tf.B"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + } + return +} + +// Tests tf_device.replicate op not running on TPU devices ignored. + +// CHECK-LABEL: func @testNonTPUDeviceReplicationIgnored +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testNonTPUDeviceReplicationIgnored(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"]}, + n = 2 : i32} { + // CHECK: %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]]) + %1 = "tf.A"() : () -> (tensor<4xf32>) + "tf.AssignVariableOp"(%arg1, %1) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + %2 = "tf.B"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_HOST"} : () -> () + tf_device.return + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 32a8000ea82..8ae6fa958a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -173,7 +173,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @tail_single_outside_compiled_op() { // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.NoOp" // CHECK-NEXT: tf_device.return %[[A_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -190,7 +190,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor "tf_device.cluster"() ( { %a = "tf.A"() : () -> tensor "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> 
() - "tf.C"() : () -> () + "tf.NoOp"() : () -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return @@ -200,7 +200,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @tail_single_outside_compiled_op_user() -> tensor { // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.NoOp" // CHECK-NEXT: tf_device.return %[[A_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -217,7 +217,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %cluster = "tf_device.cluster"() ( { %a = "tf.A"() : () -> tensor %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - "tf.C"() : () -> () + "tf.NoOp"() : () -> () tf_device.return %b : tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor // CHECK: return %[[LAUNCH_OUT]] @@ -262,7 +262,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %b = "tf.B"() : () -> tensor // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" - // CHECK-NEXT: %[[E_OUT:.*]] = "tf.E" + // CHECK-NEXT: %[[E_OUT:.*]] = "tf.Const" // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[E_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -279,7 +279,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %cluster:5 = "tf_device.cluster"() ( { %c = "tf.C"() : () -> tensor %d = "tf.D"(%c, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor - %e = "tf.E"() : () -> tensor + %e = "tf.Const"() {value = dense<0> : tensor} : () -> tensor tf_device.return %a, %b, %c, %d, %e : tensor, tensor, tensor, tensor, tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor, tensor, tensor) // CHECK: return %[[A_OUT]], %[[B_OUT]], %[[CLUSTER_OUT]]#0, %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#1 @@ -320,14 +320,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @head_tail_no_extraction_middle_outside_compiled_ops(%arg0: tensor) { // CHECK-NOT: "tf_device.launch" // CHECK: "tf_device.cluster" - // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %a = "tf.A"(%arg0) : (tensor) -> tensor + %a = "tf.Identity"(%arg0) : (tensor) -> tensor %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - "tf.C"(%b) : (tensor) -> () + %c = "tf.Identity"(%b) : (tensor) -> tensor tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return @@ -379,7 +379,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[RI]], %[[B_OUT]]) - // CHECK-NEXT: "tf.E"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: "tf.IdentityN"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) // CHECK-NEXT: tf_device.return %[[C_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -399,11 +399,139 @@ module attributes {tf.versions = {producer 
= 888 : i32}, tf.devices = ["/job:wor %b = "tf.B"() : () -> tensor %c = "tf.C"(%ri, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor %d = "tf.D"(%a, %c, %ri) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> tensor - %e = "tf.E"(%c, %a) : (tensor, tensor) -> tensor + %e:2 = "tf.IdentityN"(%c, %a) : (tensor, tensor) -> (tensor, tensor) tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () tf_device.return } return } + + // CHECK-LABEL: func @side_effect_middle + func @side_effect_middle() { + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"() : () -> () + "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @side_effect_head_no_operand + func @side_effect_head_no_operand() { + // CHECK: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.D"(%[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + + "tf_device.cluster"() ( { + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () + %c = "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> tensor + "tf.D"(%c) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @side_effect_tail_no_operand + func @side_effect_tail_no_operand() { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> () + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test embedding ops can be head extracted and side effect analysis + // predecessors are ignored. 
+ + // CHECK-LABEL: func @embedding_head_extraction + func @embedding_head_extraction(%arg0: tensor) { + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.UnknownOp" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.UnknownOp"() : () -> () + "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {_xla_outside_compilation = "cluster1", table_ids = [1, 2]} : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test side effecting op after embedding op can be head extracted. + + // CHECK-LABEL: func @op_after_embedding_head_extraction + func @op_after_embedding_head_extraction() { + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.RecvTPUEmbeddingActivations" + // CHECK-NEXT: "tf.SendTPUEmbeddingGradients" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.RecvTPUEmbeddingActivations"() {config = "test_config_recv_embedding"} : () -> tensor<512x256xf32> + "tf.SendTPUEmbeddingGradients"(%0) {N = 1 : i64, NN = 0 : i64, config = "test_config_send_embedding", operand_segment_sizes = dense<[1, 0]> : vector<2xi32>} : (tensor<512x256xf32>) -> () + "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test side effecting op before embedding op can be tail extracted. + + // CHECK-LABEL: func @op_before_embedding_tail_extraction + func @op_before_embedding_tail_extraction() { + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.UnknownOp" + // CHECK-NEXT: "tf.RecvTPUEmbeddingActivations" + // CHECK-NEXT: "tf.SendTPUEmbeddingGradients" + // CHECK-NEXT: tf_device.return + + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + "tf.UnknownOp"() : () -> () + "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () + %0 = "tf.RecvTPUEmbeddingActivations"() {config = "test_config_recv_embedding"} : () -> tensor<512x256xf32> + "tf.SendTPUEmbeddingGradients"(%0) {N = 1 : i64, NN = 0 : i64, config = "test_config_send_embedding", operand_segment_sizes = dense<[1, 0]> : vector<2xi32>} : (tensor<512x256xf32>) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 732e34fce90..9b828e42844 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -456,4 +456,739 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor } return %1 : tensor } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if + func @outside_compiled_ops_inside_tf_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"(%4, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled + + // CHECK-LABEL: func @outside_compiled_tf_if + func @outside_compiled_tf_if(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#2) + // CHECK: "tf.D"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1, %[[F_OUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: 
%[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]], %[[G_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"(%4, %3, %7) {} : (tensor, tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled and wrapped inside another + // tf.IfRegion op + + // CHECK-LABEL: func @outside_compiled_tf_if_nested + func @outside_compiled_tf_if_nested(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT_PREDICATE:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: (tensor<2x!tf.string>) -> tensor + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT_PREDICATE]]) + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#1) + // CHECK-NEXT: "tf.H"(%[[RECV_OUTPUT]]#0, %[[F_OUT]]) + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: (tensor) -> () + // CHECK-NEXT: "tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[D_OUT:[0-9]*]] = "tf.D" + // CHECK-NEXT: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: "tf._XlaHostComputeMlir"(%[[D_OUT]], %[[F_OUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf.Yield"() : () -> () + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %8 = "tf.D"(%4, %3, %7) {} 
: (tensor, tensor, tensor) -> (tensor) + %9 = "tf.F"(%4) {} : (tensor) -> (tensor) + + "tf.IfRegion"(%9) ({ + "tf.H"(%8, %7) : (tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion + // op with return values. + + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if_with_return_values + func @outside_compiled_ops_inside_tf_if_with_return_values( + %arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[HOST_COMPUTE_OUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK-NEXT: "tf.Yield"(%[[HOST_COMPUTE_OUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %7 = "tf.D"(%4, %3) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + "tf.Yield"(%7) : (tensor) -> () + }, { + + %8 = "tf.F"() : () -> (tensor) + "tf.Yield"(%8) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> (tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op without external inputs/outputs + + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_if_without_input_outputs + func @outside_compiled_ops_inside_tf_if_without_input_outputs( + %arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> 
tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: "tf.D" + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a nested + // tf.IfRegion op. + + // CHECK-LABEL: func @outside_compiled_ops_inside_nested_if + func @outside_compiled_ops_inside_nested_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK-NEXT: %[[PREDICATE2_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_1" + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE2_RECV_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: %[[ARG_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-NEXT: "tf.Yield"() : () -> () + + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) {key = "if_predicate_channel_cluster1_0"} + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H"(%[[B_OUTPUT]]) + // CHECK: "tf.XlaSendToHost"(%[[H_OUTPUT]]) {key = "if_predicate_channel_cluster1_1"} + // CHECK-NEXT: tf.IfRegion"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + // CHECK: 
%[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[H_OUTPUT]]) + // CHECK: "tf._XlaHostComputeMlir"(%[[I_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"() : () -> () + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %7 = "tf.H"(%4) : (tensor) -> (tensor) + + "tf.IfRegion"(%7)({ + "tf.Yield"() : () -> () + }, + { + %8 = "tf.I"(%7) : (tensor) -> (tensor) + "tf.D"(%8) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op body. + + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_body + func @outside_compiled_ops_inside_tf_while_body(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[BODY_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: %[[HOST_COMPUTE_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %9 = "tf.D"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + "tf.Yield"(%8, %9) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op cond. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_cond + func @outside_compiled_ops_inside_tf_while_cond(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT1:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[COND_RECV_OUTPUT1]]#0, %[[COND_RECV_OUTPUT1]]#1) + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[I_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[COND_RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT2]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK "tf.XlaHostCompute" + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: "tf.D" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.I"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %8 = "tf.H"(%7) : (tensor) -> tensor + "tf.Yield"(%8) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.C"(%arg1) : (tensor) -> tensor + %8 = "tf.D"(%arg1, %arg2) : (tensor, tensor) -> tensor + "tf.Yield"(%7, %8) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op cond and body. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_cond_body + func @outside_compiled_ops_inside_tf_while_cond_body(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster2_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[BODY_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT1:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[COND_RECV_OUTPUT1]]#0, %[[COND_RECV_OUTPUT1]]#1) + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[I_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[COND_RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT2]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK "tf.XlaHostCompute" + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.I"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %8 = "tf.H"(%7) : (tensor) -> tensor + "tf.Yield"(%8) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.C"(%arg1) : (tensor) -> tensor + %8 = "tf.D"(%arg1, %arg2) {_xla_outside_compilation = "cluster2"} : (tensor, tensor) -> tensor + "tf.Yield"(%7, %8) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op + // nested in a tf.WhileRegion. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_if + func @outside_compiled_ops_inside_tf_while_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[G_OUTPUT]]) + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK-NEXT: %[[HOST_COMPUTE_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir" + // CHECK-NEXT "tf.Yield"(%[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %10 = "tf.IfRegion"(%6) ({ + %9 = "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> tensor + "tf.Yield"(%9) : (tensor) -> () + }, { + "tf.Yield"(%arg2) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor + "tf.Yield"(%8, %10) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled with a nested tf.WhileRegion op. 
+ + // CHECK-LABEL: func @outside_compiled_tf_if_nested_while + func @outside_compiled_tf_if_nested_while(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#2) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1, %[[F_OUT]]) + // CHECK-NEXT: %[[J_OUTPUT:[0-9]*]] = "tf.J" + // CHECK-NEXT: %[[K_OUTPUT:[0-9]*]] = "tf.K" + // CHECK-NEXT: tf.WhileRegion"(%[[J_OUTPUT]], %[[D_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H"(%[[K_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]], %[[G_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %8 = "tf.D"(%4, %3, %7) {} : (tensor, tensor, tensor) -> (tensor) + %9 = "tf.J"() : () -> (tensor) + %10 = "tf.K"() : () -> (tensor) + "tf.WhileRegion"(%9, %8) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %11 = "tf.I"(%arg1, %arg2) : (tensor, tensor) -> tensor + %12 = "tf.H"(%10) : (tensor) -> tensor + "tf.Yield"(%12) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %11 = "tf.C"(%arg1) : (tensor) -> tensor + %12 = "tf.D"(%arg1, %arg2) : (tensor, tensor) -> tensor + "tf.Yield"(%11, %12) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.WhileRegion where the entire + // tf.WhileREgion op is outside compiled with a nested tf.IfRegion. 
+ + // CHECK-LABEL: func @outside_compiled_ops_tf_while_nested_if + func @outside_compiled_ops_tf_while_nested_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[HOST_RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: "tf.WhileRegion"(%[[HOST_RECV_OUTPUT]]#1, %[[HOST_RECV_OUTPUT]]#2) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK: "tf.IfRegion"(%[[HOST_RECV_OUTPUT]]#0) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[C_OUTPUT]]) + // CHECK-NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[G_OUTPUT]], %[[B_OUTPUT]], %[[A_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %10 = "tf.IfRegion"(%6) ({ + %9 = "tf.D"(%8) : (tensor) -> tensor + "tf.Yield"(%9) : (tensor) -> () + }, { + "tf.Yield"(%arg2) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor + "tf.Yield"(%8, %10) : (tensor, tensor) -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir new file mode 100644 index 00000000000..317e7036c42 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir @@ -0,0 +1,93 @@ +// RUN: tf-opt %s -tf-tpu-identity-pruning | FileCheck %s --dump-input=always + +// Tests Identity op in cluster is pruned away. + +// CHECK-LABEL: func @testIdentity +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testIdentity(%arg0: tensor) { + // CHECK-NOT: "tf.Identity" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[ARG0]] + %0 = "tf_device.cluster"() ( { + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + tf_device.return %1 : tensor + }) : () -> tensor + return +} + +// Tests IdentityN op in cluster is pruned away. + +// CHECK-LABEL: func @testIdentityN +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor) +func @testIdentityN(%arg0: tensor, %arg1: tensor) { + // CHECK-NOT: "tf.IdentityN" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG1]] + %0:2 = "tf_device.cluster"() ( { + %1:2 = "tf.IdentityN"(%arg0, %arg1) : (tensor, tensor) -> (tensor, tensor) + tf_device.return %1#0, %1#1 : tensor, tensor + }) : () -> (tensor, tensor) + return +} + +// Tests transitive Identity ops reachable from the cluster are pruned away.
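// (The transitive-Identity test below covers Identities reached through tf.PartitionedCall callees.
// The fragment here is only a rough C++ sketch, under assumed names, of the simplest in-cluster case
// that the -tf-tpu-identity-pruning tests above exercise; PruneIdentitiesInCluster is a hypothetical
// helper, not the actual pass entry point, and the real pass also follows call chains.)
#include "llvm/Support/Casting.h"
#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

static void PruneIdentitiesInCluster(mlir::Operation *cluster) {
  cluster->walk([](mlir::Operation *op) {
    if (!llvm::isa<mlir::TF::IdentityOp>(op) && !llvm::isa<mlir::TF::IdentityNOp>(op)) return;
    // Identity/IdentityN forward each operand to the result with the same
    // index, so uses can be rewired one-to-one before erasing the op.
    for (unsigned i = 0, e = op->getNumResults(); i != e; ++i)
      op->getResult(i).replaceAllUsesWith(op->getOperand(i));
    op->erase();
  });
}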
+ +// CHECK-LABEL: func @testTransitiveIdentity +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testTransitiveIdentity(%arg0: tensor) { + // CHECK: "tf_device.cluster" + // CHECK: "tf.PartitionedCall"([[ARG0]]) + // CHECK-SAME: f = @callee0 + %0 = "tf_device.cluster"() ( { + %1 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @callee0} : (tensor) -> tensor + tf_device.return %1 : tensor + }) : () -> tensor + return +} + +// CHECK-LABEL: func @callee0 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee0(%arg0: tensor) -> tensor { + // CHECK-NOT: "tf.Identity" + // CHECK: "tf.PartitionedCall"([[ARG0]]) + // CHECK-SAME: f = @callee1 + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @callee1} : (tensor) -> tensor + return %1 : tensor +} + +// CHECK-LABEL: func @callee1 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee1(%arg0: tensor) -> tensor { + // CHECK-NOT: "tf.Identity" + // CHECK: return [[ARG0]] + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// Tests Identity ops not reachable from the cluster are not pruned away. + +// CHECK-LABEL: func @testIdentityOutsideCluster +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testIdentityOutsideCluster(%arg0: tensor) { + // CHECK: [[IDENTITY:%.*]] = "tf.Identity"([[ARG0]]) + // CHECK: [[CLUSTER:%.*]] = "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[IDENTITY]] + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + %1 = "tf_device.cluster"() ( { + tf_device.return %0 : tensor + }) : () -> tensor + // CHECK: "tf.PartitionedCall"([[CLUSTER]]) + // CHECK-SAME: f = @callee2 + %2 = "tf.PartitionedCall"(%1) {config = "", config_proto = "", executor_type = "", f = @callee2} : (tensor) -> tensor + return +} + +// CHECK-LABEL: func @callee2 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee2(%arg0: tensor) -> tensor { + // CHECK: [[IDENTITY:%.*]] = "tf.Identity"([[ARG0]]) + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + // CHECK: return [[IDENTITY]] + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir index 1394bd22dc8..269af51504f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir @@ -75,7 +75,7 @@ func @two_clusters_no_dependencies() { // CHECK: "tf.opB" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER4:[a-zA-Z_0-9]+]]" // CHECK: "tf.opC" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER4]]" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER5:[a-zA-Z_0-9]+]]" // CHECK: "tf.opD" "tf_device.cluster"() ( { "tf.opA"() : () -> () @@ -135,6 +135,27 @@ func @two_clusters_with_two_ops_each() { return } +// CHECK-LABEL: func @resource_side_effect_cycle +func @resource_side_effect_cycle(%arg0: tensor>>, %arg1: tensor>>) { + // CHECK: "tf.ReadVariableOp" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK-NEXT: "tf.AssignVariableOp" + // CHECK-NOT: {_xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %read0 = "tf.ReadVariableOp"(%arg0) {_xla_outside_compilation = "0"} : (tensor>>) -> tensor + %idet0 = "tf.Identity"(%read0) {_xla_outside_compilation = "0"} : (tensor) -> tensor + 
"tf.AssignVariableOp"(%arg1, %idet0) : (tensor>>, tensor) -> () + %read1 = "tf.ReadVariableOp"(%arg1) {_xla_outside_compilation = "0"} : (tensor>>) -> tensor + %idet1 = "tf.Identity"(%read1) {_xla_outside_compilation = "0"} : (tensor) -> tensor + %add0 = "tf.AddV2"(%idet0, %idet1) {_xla_outside_compilation = "0"} : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%arg0, %add0) {_xla_outside_compilation = "0"} : (tensor>>, tensor) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + // CHECK-LABEL: func @two_clusters_with_same_parent func @two_clusters_with_same_parent() { // CHECK: "tf.opA" @@ -144,10 +165,10 @@ func @two_clusters_with_same_parent() { // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER10]]" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER11:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opD" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER10]]" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER12:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opE" // CHECK-NEXT: "tf.opF" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER11]]" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER13:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opG" "tf_device.cluster"() ( { %a = "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor @@ -171,8 +192,8 @@ func @two_clusters_with_same_outside_compiled_parent() { // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER12]]" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER13:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opD" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER12]]" - // CHECK-NEXT: "tf.opE" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER14:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: "tf.opF" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER13]]" // CHECK-NEXT: "tf.opG" @@ -182,7 +203,7 @@ func @two_clusters_with_same_outside_compiled_parent() { %b = "tf.opB"(%a) : (tensor) -> tensor %c = "tf.opC"(%b) {_xla_outside_compilation = "0"} : (tensor) -> tensor %d = "tf.opD"() {_xla_outside_compilation = "0"} : () -> tensor - %e = "tf.opE"(%d) : (tensor) -> tensor + %e = "tf.Identity"(%d) : (tensor) -> tensor %f = "tf.opF"(%e) {_xla_outside_compilation = "0"} : (tensor) -> tensor %g = "tf.opG"(%c, %f) {_xla_outside_compilation = "0"} : (tensor, tensor) -> tensor tf_device.return @@ -213,14 +234,15 @@ func @outside_compile_with_block() { // CHECK-NEXT: "tf.opB" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER15]]" // CHECK: "tf.opC" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER15]]" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER14]]" + // CHECK-SAME: _xla_outside_compilation = "{{[a-zA-Z_0-9]+}}" "tf_device.cluster"() ( { %a = "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor - %b = "tf.opB"() {_xla_outside_compilation = "0"} : () -> tensor + %b = "tf.opB"(%a) {_xla_outside_compilation = "0"} : (tensor) -> tensor "tf_device.cluster" () ( { tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () - %c = "tf.opC"() {_xla_outside_compilation = "0"} : () -> tensor + %c = "tf.opC"(%b) {_xla_outside_compilation = "0"} : (tensor) -> tensor tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () return @@ -248,3 +270,144 @@ func @two_clusters_with_one_op_each_with_indirect_dependency() { }) {cluster_attr = "cluster_attr"} : () -> () return } + +// CHECK-LABEL: func @check_ops_with_data_dependency_added_as_host_cluster +func @check_ops_with_data_dependency_added_as_host_cluster() { + // CHECK: "tf.opA" + // CHECK-NEXT: "tf.opB" + // CHECK-SAME: 
_xla_outside_compilation = "[[CLUSTER16:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Identity" + // CHECK-NEXT: "tf.Identity" + // CHECK-NEXT: "tf.opE" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER16]]" + // CHECK-NEXT: "tf.opF" + "tf_device.cluster"() ( { + %a = "tf.opA"() : () -> tensor + %b = "tf.opB"(%a) {_xla_outside_compilation = "0"} : (tensor) -> tensor + %c = "tf.Identity"(%b) : (tensor) -> tensor + %d = "tf.Identity"(%c) : (tensor) -> tensor + %e = "tf.opE"(%d, %b, %c) {_xla_outside_compilation = "0"} : (tensor, tensor, tensor) -> tensor + "tf.opF"(%e) : (tensor) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_op_inside_nested_region_clustered +func @check_op_inside_nested_region_clustered(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.IfRegion"(%0) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() : () -> (tensor) + %3 = "tf.C"() : () -> (tensor) + %4 = "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"logits"> : tensor} : () -> tensor + %5 = "tf.Const"() {_xla_outside_compilation = "auto1", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_ops_inside_different_block_in_different_cluster +func @check_ops_inside_different_block_in_different_cluster(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.C" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + // CHECK: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER19:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.D" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER19]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() {_xla_outside_compilation = "auto1"} : () -> (tensor) + %3 = "tf.C"() {_xla_outside_compilation = "auto2"} : () -> (tensor) + "tf.IfRegion"(%0) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto3", value = dense<"logits"> : tensor} : () -> tensor + %5 = "tf.Const"() 
{_xla_outside_compilation = "auto4", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto5", value = dense<"a"> : tensor} : () -> tensor + "tf.D"(%3, %4, %1) {_xla_outside_compilation = "auto6"} : (tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_clustering_ops_inside_nested_control_flow +func @check_clustering_ops_inside_nested_control_flow(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.C" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK: "tf.IfRegion" + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() {_xla_outside_compilation = "auto1"} : () -> (tensor) + %3 = "tf.C"() {_xla_outside_compilation = "auto2"} : () -> (tensor) + "tf.IfRegion"(%0) ( { + %6 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.IfRegion"(%6) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto3", value = dense<"logits"> : tensor} : () -> tensor + %5 = "tf.Const"() {_xla_outside_compilation = "auto4", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + "tf.Yield"(%6) : (tensor) -> () + }, { + %7 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%7) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir new file mode 100644 index 00000000000..ad4433c1d20 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir @@ -0,0 +1,137 @@ +// RUN: tf-opt %s -tf-tpu-parallel-execute-sink-resource-write | FILECHECK_OPTS="" FileCheck %s + +// CHECK-LABEL: func @multiple_uses +// CHECK-SAME: ({{.+}}: tensor, [[ARG1:%.+]]: tensor) +func @multiple_uses(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : 
tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG1]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%arg1, %0#0) : (tensor, tensor) -> () + // CHECK-NEXT: return [[PARALLEL_EXECUTE]]#0 + return %0#0 : tensor +} + +// CHECK-LABEL: func @not_assign_var +// CHECK-SAME: ({{.+}}: tensor, [[ARG1:%.+]]: tensor) +func @not_assign_var(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignAddVariableOp"([[ARG1]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignAddVariableOp"(%arg1, %0#0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_output +// CHECK-SAME: ([[ARG0:%.+]]: tensor, {{.+}}: tensor) +func @resource_handle_output(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg1 : tensor + }, { + tf_device.return %arg1 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[PARALLEL_EXECUTE]]#0, [[ARG0]]) + "tf.AssignVariableOp"(%0#0, %arg0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_and_value_output +func @resource_handle_and_value_output(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0, %arg1 : tensor, tensor + }, { + tf_device.return + }) : () -> (tensor, tensor) + // CHECK: "tf.AssignVariableOp"([[PARALLEL_EXECUTE]]#1, [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%0#1, %0#0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_after_parallel_execute +func @resource_handle_after_parallel_execute(%arg0: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: [[VAR:%.+]] = "tf.VarHandleOp" + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> tensor>> + // CHECK-NEXT: "tf.AssignVariableOp"([[VAR]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%1, %0#0) : (tensor>>, tensor) -> () + return +} + +// CHECK-LABEL: func @replace_single_output +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor) +func @replace_single_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) { + // CHECK: {{%.+}}:2 = "tf_device.parallel_execute" + %0:3 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG3]], [[ARG1]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]] : tensor, tensor + tf_device.return %arg0, %arg1, %arg2 : tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: tf_device.return + tf_device.return + // CHECK-NEXT: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg3, %0#1) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @replace_multiple_outputs +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor, 
[[ARG4:%.+]]: tensor, [[ARG5:%.+]]: tensor, [[ARG6:%.+]]: tensor) +func @replace_multiple_outputs(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor) { + // CHECK: {{%.+}}:3 = "tf_device.parallel_execute" + %0:5 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG5]], [[ARG1]]) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG6]], [[ARG3]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]], [[ARG4]] : tensor, tensor, tensor + tf_device.return %arg0, %arg1, %arg2, %arg3, %arg4 : tensor, tensor, tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: tf_device.return + tf_device.return + // CHECK-NEXT: }) : () -> (tensor, tensor, tensor) + }) : () -> (tensor, tensor, tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg5, %0#1) : (tensor, tensor) -> () + "tf.AssignVariableOp"(%arg6, %0#3) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @replace_multiple_outputs_regions +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor, [[ARG4:%.+]]: tensor, [[ARG5:%.+]]: tensor, [[ARG6:%.+]]: tensor, [[ARG7:%.+]]: tensor) +func @replace_multiple_outputs_regions(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor) { + // CHECK: {{%.+}}:4 = "tf_device.parallel_execute" + %0:6 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG6]], [[ARG1]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]] : tensor, tensor + tf_device.return %arg0, %arg1, %arg2 : tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG7]], [[ARG4]]) + // CHECK-NEXT: tf_device.return [[ARG3]], [[ARG5]] : tensor, tensor + tf_device.return %arg3, %arg4, %arg5 : tensor, tensor, tensor + // CHECK-NEXT: }) : () -> (tensor, tensor, tensor, tensor) + }) : () -> (tensor, tensor, tensor, tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg6, %0#1) : (tensor, tensor) -> () + "tf.AssignVariableOp"(%arg7, %0#4) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index 2a0091ce9bf..ef7b52cd978 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -1262,15 +1262,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NOT:"tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf.E"(%[[COMPILE_OUTPUT]]#1 %3 = "tf_device.parallel_execute"() ( { - %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor - "tf.D"(%program) : (tensor) -> () + %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor<2x!tf.string> + "tf.D"(%program) : (tensor<2x!tf.string>) -> () tf_device.return }, { %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor) -> tensor tf_device.return %4 : tensor }, { - %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor - "tf.E"(%program) : (tensor) -> () + %program = 
"tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor<2x!tf.string> + "tf.E"(%program) : (tensor<2x!tf.string>) -> () tf_device.return }) : () -> (tensor) tf_device.return %3 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir index 280986a7ee1..ceecb3e72d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir @@ -83,5 +83,80 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" } } -// ---- +// ----- + +// Tests for space to depth host and device transform with replicate inputs. + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSITE:0" = {}, "/job:localhost/replica:0/task:0/device:CPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:1" = {}, "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 458 : i32}} { + func @main(%arg0: tensor<*x!tf.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg1: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg2: tensor<*x!tf.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg4: tensor<*x!tf.resource> {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg5: tensor {tf._user_specified_name = "iterator", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg7: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg8: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg9: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg10: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg11: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}, %arg12: tensor<*x!tf.resource>> {tf._composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"}) attributes {tf.entry_function = {control_outputs = "IteratorGetNext,IteratorGetNext_1,CrossReplicaSum,AssignAddVariableOp,CrossReplicaSum_1,AssignAddVariableOp_1,CrossReplicaSum_2,AssignAddVariableOp_2,CrossReplicaSum_3,AssignAddVariableOp_3", inputs = 
"iterator,iterator_1,iterator_2,iterator_3,iterator_4,iterator_5,resnet50_conv1_conv2d_conv1_kernel_140365606309224_handle_inputs_0,resnet50_fc1000_matmul_fc1000_kernel_140365944145960_handle_inputs_0,resnet50_fc1000_biasadd_fc1000_bias_140365944146240_handle_inputs_0,total_140366323758976_handle_inputs_0,count_140366323759312_handle_inputs_0,total_140366323760264_handle_inputs_0,count_140366323760600_handle_inputs_0", outputs = ""}} { + // CHECK: %[[INPUT00:.*]] = "tf.IteratorGetNext" + // CHECK-DAG: %[[SPACETODEPTH00:.*]] = "tf.SpaceToDepth"([[INPUT00:.*]]#0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + %0:2 = "tf.IteratorGetNext"(%arg2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<2x224x224x3xf32>, tensor<2x1xf32>) + // CHECK: %[[INPUT01:.*]] = "tf.IteratorGetNext" + // CHECK-DAG: %[[SPACETODEPTH01:.*]] = "tf.SpaceToDepth"([[INPUT01:.*]]#0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + %1:2 = "tf.IteratorGetNext"(%arg4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<2x224x224x3xf32>, tensor<2x1xf32>) + tf_device.replicate([%0#0, %1#0] as %arg13: tensor<2x224x224x3xf32>, [%0#1, %1#1] as %arg14: tensor<2x1xf32>, %arg6 as %arg15: tensor<*x!tf.resource>>, %arg8 as %arg16: tensor<*x!tf.resource>>, %arg7 as %arg17: tensor<*x!tf.resource>>, %arg9 as %arg18: tensor<*x!tf.resource>>, %arg10 as %arg19: tensor<*x!tf.resource>>, %arg11 as %arg20: tensor<*x!tf.resource>>, %arg12 as %arg21: tensor<*x!tf.resource>>) {_mirrored_variable_indices = [2, 3, 4, 5, 6, 7, 8], _replicated_input_indices = [1, 2, -1, -1, -1, -1, -1, -1, -1], devices = {}, n = 2 : i32} { + %2 = "tf.ReadVariableOp"(%arg15) : (tensor<*x!tf.resource>>) -> tensor<7x7x3x64xf32> + %3 = "tf.ReadVariableOp"(%arg16) : (tensor<*x!tf.resource>>) -> tensor<1001xf32> + %4 = "tf.ReadVariableOp"(%arg17) : (tensor<*x!tf.resource>>) -> tensor<64x1001xf32> + %5 = "tf.ReadVariableOp"(%arg18) : (tensor<*x!tf.resource>>) -> tensor + %6 = "tf.ReadVariableOp"(%arg19) : (tensor<*x!tf.resource>>) -> tensor + %7 = "tf.ReadVariableOp"(%arg20) : (tensor<*x!tf.resource>>) -> tensor + %8 = "tf.ReadVariableOp"(%arg21) : (tensor<*x!tf.resource>>) -> tensor + %9:4 = "tf_device.cluster_func"(%arg13, %arg14, %2, %4, %3, %5, %6, %7, %8) {_tpu_replicate = "cluster_eval_step", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], func = @_func, host_compute_core = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], num_cores_per_replica = 1 : i64, output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : (tensor<2x224x224x3xf32>, tensor<2x1xf32>, tensor<7x7x3x64xf32>, tensor<64x1001xf32>, tensor<1001xf32>, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg18, %9#0) : (tensor<*x!tf.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg19, %9#1) : (tensor<*x!tf.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg20, %9#2) : 
(tensor<*x!tf.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg21, %9#3) : (tensor<*x!tf.resource>>, tensor) -> () + tf_device.return + } + return + } + // CHECK-LABEL: func @_func + // CHECK-SAME: [[FUNCINPUT00:.*]]: tensor<2x112x112x12xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg1: tensor<2x1xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<7x7x3x64xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg3: tensor<64x1001xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg4: tensor<1001xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg5: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg6: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg7: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg8: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) attributes {sym_visibility = "private"} { + func @_func(%arg0: tensor<2x224x224x3xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg1: tensor<2x1xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<7x7x3x64xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg3: tensor<64x1001xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg4: tensor<1001xf32> {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg5: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg6: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg7: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg8: tensor {mhlo.is_same_data_across_replicas, mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) attributes {sym_visibility = "private"} { + %0 = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<[[0, 1]]> : tensor<1x2xi32>} : () -> tensor<1x2xi32> + %4 = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %5 = "tf.Const"() {value = dense<2.500000e-01> : tensor} : () -> tensor + %6 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %7 = "tf.Const"() {value = dense<[-1, 1001]> : tensor<2xi32>} : () -> tensor<2xi32> + %8 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + %9 = "tf.Const"() {value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %10 = "tf.Const"() {value = dense<[[0, 0], [3, 3], [3, 3], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + %11 = "tf.Pad"(%arg0, %10) : (tensor<2x224x224x3xf32>, tensor<4x2xi32>) -> tensor<2x230x230x3xf32> + %12 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2x1xf32>) -> tensor<2x1xi64> + %13 = "tf.Reshape"(%12, %9) : 
(tensor<2x1xi64>, tensor<1xi32>) -> tensor<2xi64> + %14 = "tf.Squeeze"(%arg1) {squeeze_dims = [-1]} : (tensor<2x1xf32>) -> tensor<2xf32> + // CHECK: "tf.Conv2D" + // CHECK-SAME: strides = [1, 1, 1, 1] + // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4x4x12x64xf32>) -> tensor<2x112x112x64xf32> + %15 = "tf.Conv2D"(%11, %arg2) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<2x112x112x64xf32> + %16 = "tf.Mean"(%15, %8) {keep_dims = false} : (tensor<2x112x112x64xf32>, tensor<2xi32>) -> tensor<2x64xf32> + %17 = "tf.MatMul"(%16, %arg3) {transpose_a = false, transpose_b = false} : (tensor<2x64xf32>, tensor<64x1001xf32>) -> tensor<2x1001xf32> + %18 = "tf.BiasAdd"(%17, %arg4) {data_format = "NHWC"} : (tensor<2x1001xf32>, tensor<1001xf32>) -> tensor<2x1001xf32> + %19 = "tf.Reshape"(%18, %7) : (tensor<2x1001xf32>, tensor<2xi32>) -> tensor<2x1001xf32> + %loss, %backprop = "tf.SparseSoftmaxCrossEntropyWithLogits"(%19, %13) : (tensor<2x1001xf32>, tensor<2xi64>) -> (tensor<2xf32>, tensor<2x1001xf32>) + %20 = "tf.Sum"(%loss, %6) {keep_dims = false} : (tensor<2xf32>, tensor<1xi32>) -> tensor + %21 = "tf.Mul"(%20, %5) : (tensor, tensor) -> tensor + %22 = "tf.Sum"(%21, %4) {keep_dims = false} : (tensor, tensor<0xi32>) -> tensor + %23 = "tf.CrossReplicaSum"(%22, %3) : (tensor, tensor<1x2xi32>) -> tensor + %24 = "tf.Softmax"(%18) : (tensor<2x1001xf32>) -> tensor<2x1001xf32> + %25 = "tf.ArgMax"(%24, %2) : (tensor<2x1001xf32>, tensor) -> tensor<2xi64> + %26 = "tf.Cast"(%25) {Truncate = false} : (tensor<2xi64>) -> tensor<2xf32> + %27 = "tf.Equal"(%14, %26) {incompatible_shape_error = true} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xi1> + %28 = "tf.Cast"(%27) {Truncate = false} : (tensor<2xi1>) -> tensor<2xf32> + %29 = "tf.Sum"(%28, %6) {keep_dims = false} : (tensor<2xf32>, tensor<1xi32>) -> tensor + %30 = "tf.CrossReplicaSum"(%29, %3) : (tensor, tensor<1x2xi32>) -> tensor + %31 = "tf.AddV2"(%arg5, %23) : (tensor, tensor) -> tensor + %32 = "tf.CrossReplicaSum"(%1, %3) : (tensor, tensor<1x2xi32>) -> tensor + %33 = "tf.AddV2"(%arg6, %32) : (tensor, tensor) -> tensor + %34 = "tf.AddV2"(%arg7, %30) : (tensor, tensor) -> tensor + %35 = "tf.CrossReplicaSum"(%0, %3) : (tensor, tensor<1x2xi32>) -> tensor + %36 = "tf.AddV2"(%arg8, %35) : (tensor, tensor) -> tensor + return %31, %33, %34, %36 : tensor, tensor, tensor, tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir b/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir new file mode 100644 index 00000000000..8cc8d273bec --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir @@ -0,0 +1,91 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-test-visitor-util-interrupt %s + +// Test simple operations with no regions and no interrupts. They should be +// visited with stage "before all regions". + +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{4: after all regions}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{3: before all regions}} + return %0 : tensor +} + +// ----- + +// Test simple operations with no regions and interrupts. 
No remarks after +// the interrupting operation is visited. + +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{2: walk was interrupted}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + %0 = "tf.Identity"(%arg0) {interrupt_before_all = true} : (tensor) -> tensor + return %0 : tensor +} + +// ----- +// Test operation with non empty regions. +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{5: walk was interrupted}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + %0 = "tf.unknownop"(%arg0) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.yield"(%1) : (tensor) -> () + }) {interrupt_after_all = true} : (tensor) -> tensor + return %0 : tensor +} + +// ----- +// Test operation with multiple regions. +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{5: walk was interrupted}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + %0 = "tf.unknownop"(%arg0) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + "tf.yield"(%1) : (tensor) -> () + }) {interrupt_after_region = 0} : (tensor) -> tensor + return %0 : tensor +} + +// ----- +// Test static filtering +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{7: walk was interrupted}} +func @foo(%arg0: tensor, %arg1: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + // expected-remark@below {{5: before region #1}} + // expected-remark@below {{8: before all regions}} + // expected-remark@below {{9: before region #1}} + // expected-remark@below {{10: after all regions}} + %0 = "tf.IfRegion"(%arg1) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.Yield"(%1) : (tensor) -> () + }, { + // expected-remark@below {{6: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + "tf.Yield"(%1) { interrupt_after_all = true } : (tensor) -> () + }) {is_stateless = true}: (tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir b/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir new file mode 100644 index 00000000000..9a832b7fe8d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir @@ -0,0 +1,102 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-test-visitor-util %s + +// Test simple operations with no regions. They should be visited with stage +// = before all regions. 
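// (Illustrative note, not part of the test input: the interrupt behaviour exercised by the
// -tf-test-visitor-util-interrupt cases above maps onto MLIR's interruptible walk idiom,
// sketched below in C++. The attribute name comes from the tests themselves, but
// WalkUntilInterrupted is a hypothetical helper standing in for the visitor utility under test.)
#include "mlir/IR/Operation.h"
#include "mlir/IR/Visitors.h"

static bool WalkUntilInterrupted(mlir::Operation *root) {
  mlir::WalkResult result = root->walk([](mlir::Operation *op) {
    // Stop visiting as soon as an op carries the interrupt marker; ops after it
    // are never reached, which is why no further remarks are emitted.
    if (op->getAttr("interrupt_before_all")) return mlir::WalkResult::interrupt();
    return mlir::WalkResult::advance();
  });
  return result.wasInterrupted();
}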
+ +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{4: after all regions}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{3: before all regions}} + return %0 : tensor +} + +// ----- +// Test operation with empty regions. +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{5: after all regions}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + // expected-remark@below {{3: after all regions}} + %0 = "tf.unknownop"(%arg0) ({ + }) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + return %0 : tensor +} + +// ----- +// Test operation with non empty regions. +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{7: after all regions}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + // expected-remark@below {{5: after all regions}} + %0 = "tf.unknownop"(%arg0) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.yield"(%1) : (tensor) -> () + }) : (tensor) -> tensor + // expected-remark@below {{6: before all regions}} + return %0 : tensor +} + +// ----- +// Test operation with multiple regions. +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{10: after all regions}} +func @foo(%arg0: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + // expected-remark@below {{5: before region #1}} + // expected-remark@below {{8: after all regions}} + %0 = "tf.unknownop"(%arg0) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.yield"(%1) : (tensor) -> () + }, { + // expected-remark@below {{6: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{7: before all regions}} + "tf.yield"(%1) : (tensor) -> () + }) : (tensor) -> tensor + // expected-remark@below {{9: before all regions}} + return %0 : tensor +} + +// ----- +// Test static filtering +// expected-remark@below {{0: before all regions}} +// expected-remark@below {{10: after all regions}} +func @foo(%arg0: tensor, %arg1: tensor) -> tensor { + // expected-remark@below {{1: before all regions}} + %cst = constant dense<1.0> : tensor + // expected-remark@below {{2: before all regions}} + // expected-remark@below {{5: before region #1}} + // expected-remark@below {{8: after all regions}} + // expected-remark@below {{11: before all regions}} + // expected-remark@below {{12: before region #1}} + // expected-remark@below {{13: after all regions}} + %0 = "tf.IfRegion"(%arg1) ({ + // expected-remark@below {{3: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // expected-remark@below {{4: before all regions}} + "tf.Yield"(%1) : (tensor) -> () + }, { + // expected-remark@below {{6: before all regions}} + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + // 
expected-remark@below {{7: before all regions}} + "tf.Yield"(%1) : (tensor) -> () + }) {is_stateless = true}: (tensor) -> tensor + // expected-remark@below {{9: before all regions}} + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc index de73dff8b0b..fe0c5bea44e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h" - #include #include #include @@ -41,7 +39,48 @@ namespace mlir { namespace TF { namespace { -// Replace TF BatchMatMul by TF Einsum + +// Replace TF BatchMatMul by TF Einsum op +template +class ConvertTFBatchMatMulToEinsumOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BatchMatMulOpType op, + PatternRewriter& rewriter) const override { + Value input_lhs = op.x(); + Value input_rhs = op.y(); + + // LHS and RHS must be a ranked tensor type + auto lhs_type = input_lhs.getType().dyn_cast(); + auto rhs_type = input_rhs.getType().dyn_cast(); + + if (!lhs_type || !rhs_type) return failure(); + + auto lhs_shape = lhs_type.getShape(); + auto rhs_shape = rhs_type.getShape(); + + // Ensure that input ranks are at least 2. + const int dims_a = lhs_shape.size(); + const int dims_b = rhs_shape.size(); + if (dims_a < 2 || dims_b < 2) { + return failure(); + } + + // einsum equation for batchmatmul + std::string equation("...mk,...kn->...mn"); + if (op.adj_x()) std::swap(equation[3], equation[4]); + if (op.adj_y()) std::swap(equation[6 + 3], equation[6 + 4]); + + rewriter.replaceOpWithNewOp( + op, op.getType(), + /*inputs=*/ValueRange({input_lhs, input_rhs}), + /*equation=*/equation); + + return success(); + } +}; + struct BatchMatMulToEinsumPass : public PassWrapper { void runOnFunction() override; @@ -57,65 +96,10 @@ void BatchMatMulToEinsumPass::runOnFunction() { applyPatternsAndFoldGreedily(func, patterns); } -} // namespace - -template -LogicalResult -ConvertTFBatchMatMulToEinsumOp::matchAndRewrite( - BatchMatMulOpType op, PatternRewriter& rewriter) const { - Value input_lhs = op.x(); - Value input_rhs = op.y(); - - if (!input_lhs.getType().isa()) { - // LHS must be a ranked tensor type - return failure(); - } - if (!input_rhs.getType().isa()) { - // RHS must be a ranked tensor type - return failure(); - } - - auto lhs_type = input_lhs.getType().dyn_cast(); - auto rhs_type = input_rhs.getType().dyn_cast(); - - if (!lhs_type || !rhs_type) { - return failure(); - } - - auto lhs_shape = lhs_type.getShape(); - auto rhs_shape = rhs_type.getShape(); - - Location loc = op.getLoc(); - - // Ensure that input ranks are at least 2. 
- const int dims_a = lhs_shape.size(); - const int dims_b = rhs_shape.size(); - if (dims_a < 2 || dims_b < 2) { - // Both inputs must have rank >= 2 - return failure(); - } - - // einsum equation for batchmatmul - std::string equation("...mk,...kn->...mn"); - - if (op.adj_x()) { - std::swap(equation[3], equation[4]); - } - if (op.adj_y()) { - std::swap(equation[6 + 3], equation[6 + 4]); - } - - llvm::SmallVector inputs = {input_lhs, input_rhs}; - rewriter.replaceOpWithNewOp(op, op.getType(), - /*inputs=*/ValueRange(inputs), - /*equation=*/equation); - - return success(); -} - -static PassRegistration pass( +PassRegistration pass( "tf-batch-matmul-to-tf-einsum", "Replace TF BatchMatMul op by TF Einsum op."); +} // namespace std::unique_ptr> CreateBatchMatMulToEinsumPass() { return std::make_unique(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h deleted file mode 100644 index d39f3575b4a..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ - -#include "llvm/ADT/ArrayRef.h" -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/core/util/matmul_bcast.h" - -namespace mlir { -namespace TF { - -// Replace TF BatchMatMul by TF Einsum op -template -class ConvertTFBatchMatMulToEinsumOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite( - BatchMatMulOpType op, - PatternRewriter& rewriter) const override; // NOLINT -}; - -} // namespace TF -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index ed0528ae054..358963a79e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -47,7 +47,8 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) { pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); add_pass(TFDevice::CreateParallelizeEmbeddingParamsOpsPass()); - add_pass(TFDevice::CreateReplicateToIslandPass()); + pm.addPass(TFDevice::CreateReplicateToIslandPass()); + pm.addPass(CreateBreakUpIslandsPass()); add_pass(TFDevice::CreateParallelExecuteToIslandsPass()); add_pass(TFDevice::CreateLaunchToDeviceAttributePass()); } @@ -85,8 +86,8 @@ void 
CreateTPUBridgePipeline(OpPassManager &pm) { // Encode this in its own scope so that func_pm is not mistakenly used // later on. { + pm.addPass(CreateTPUClusterFormationPass()); OpPassManager &func_pm = pm.nest(); - func_pm.addPass(CreateTPUClusterFormationPass()); // Place DecomposeResourceOpsPass before TFExecutorConstantSinking pass // because DecomposeResourceOpsPass uses pattern rewriter which hoists // changed constants out of tf_device.Launch. @@ -94,26 +95,32 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { func_pm.addPass(CreateTPUHostComputationExpansionPass()); func_pm.addPass(CreateTPUUpdateEmbeddingEnqueueOpInputsPass()); } - pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); - pm.addPass(mlir::createInlinerPass()); - pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); - pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); - // Run another shape inference pass because resource decomposition might have // created new partial types. pm.addPass(TF::CreateTFShapeInferencePass()); - pm.addNestedPass(tf_executor::CreateTFExecutorConstantSinkingPass()); + pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); + pm.addPass(mlir::createInlinerPass()); + pm.addPass(CreateTPUClusterCleanupAttributesPass()); pm.addPass(TFDevice::CreateResourceOpLiftingPass()); + pm.addPass(TFDevice::CreateMarkOpsForOutsideCompilationPass()); + pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); + pm.addPass(CreateTPUExtractOutsideCompilationPass()); + + pm.addNestedPass(tf_executor::CreateTFExecutorConstantSinkingPass()); pm.addPass(TF::CreateResourceDeviceInferencePass()); pm.addPass(TFDevice::CreateClusterOutliningPass()); pm.addPass(CreateTPUDynamicPaddingMapperPass()); + pm.addPass(CreateTPUResourceReadForWritePass()); pm.addPass(CreateTPUShardingIdentificationPass()); pm.addPass(TFDevice::CreateAnnotateParameterReplicationPass()); pm.addPass(CreateTPURewritePass()); pm.addPass(createSymbolDCEPass()); pm.addNestedPass(TFDevice::CreateReplicateInvariantOpHoistingPass()); pm.addNestedPass(CreateTPUDynamicLayoutPass()); + pm.addNestedPass(CreateTPUParallelExecuteSinkResourceWritePass()); pm.addNestedPass(CreateTPUMergeVariablesWithExecutePass()); + pm.addNestedPass(CreateTPUColocateCompositeResourceOps()); + pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); pm.addPass(CreateTPUVariableReformattingPass()); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc index 2b8ab85be38..e85058a1964 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc @@ -39,6 +39,10 @@ namespace { struct ClusterFormationPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc index 57a5cd888a1..cde07503e75 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -181,14 +181,14 @@ llvm::Optional GetElementTypeFromAccess( llvm::function_ref(Operation*)> infer_from_op) { for (auto& use : collection.getUses()) { if (auto while_op = llvm::dyn_cast(use.getOwner())) { - auto body = while_op.body_func(); + auto body = 
while_op.body_function(); assert(body); auto type_from_body = GetElementTypeFromAccess( body.getArgument(use.getOperandNumber()), module, infer_from_op); if (type_from_body.hasValue()) return type_from_body; } else if (auto if_op = llvm::dyn_cast(use.getOwner())) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); assert(then_branch && else_branch); auto type_from_then = GetElementTypeFromAccess( then_branch.getArgument(use.getOperandNumber() - 1), module, diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 1429e2b3fd4..3005c78c54f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" @@ -68,7 +69,7 @@ static bool ShouldBeFolded(Operation* inst) { LogicalResult ConstantFoldFallbackHook( Operation* inst, ArrayRef operands, - SmallVectorImpl& results) { // NOLINT + SmallVectorImpl& results) { // NOLINT // Instructions with side effects should not be constant folded to preserve // the original semantics. if (inst->getNumRegions() != 0 || !MemoryEffectOpInterface::hasNoEffect(inst)) @@ -126,8 +127,16 @@ LogicalResult ConstantFoldFallbackHook( // TODO(jpienaar): Avoid using global context & mutex here. static auto* mu = new tensorflow::mutex(); tensorflow::mutex_lock l(*mu); - return tensorflow::EvaluateOperation(inst, inputs, ctx, &results); + SmallVector constants; + LogicalResult status = + tensorflow::EvaluateOperation(inst, inputs, ctx, &constants); + results.assign(constants.begin(), constants.end()); + return status; } +static bool init_hooks = ([] () { + TensorFlowDialect::RegisterConstantFoldHook(ConstantFoldFallbackHook); +}(), true); + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h index 69e39080965..887eea745e7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h @@ -27,7 +27,7 @@ namespace TF { LogicalResult ConstantFoldFallbackHook( Operation *inst, ArrayRef operands, - SmallVectorImpl &results); // NOLINT + SmallVectorImpl &results); // NOLINT } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc new file mode 100644 index 00000000000..b5d09f7a794 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc @@ -0,0 +1,162 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/UseDefLists.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +// -------------------------------------------------------------------------- // +// Fuse ContractionFusableInterface operations into contraction operation. +// -------------------------------------------------------------------------- // + +template +class FuseIntoContractionOp : public RewritePattern { + public: + FuseIntoContractionOp() + : RewritePattern(PatternBenefit(1), MatchAnyOpTypeTag()) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + auto fusable = dyn_cast(op); + if (!fusable) return failure(); + + auto failed = [&](Twine message) -> LogicalResult { + return rewriter.notifyMatchFailure(op, message); + }; + + // Check if the operation can be fused. + Optional fusion = fusable.GetContractionFusion(); + if (!fusion.hasValue()) { + return failed("returned empty contraction fusion specification"); + } + + // Check if preceding operation is a BaseOp or FusedOp that we can use for + // fusion. + Operation *fuse_into = nullptr; + Value operand = op->getOperand(0); + + if (BaseOp base_op = operand.getDefiningOp()) { + fuse_into = base_op.getOperation(); + } else if (FusedOp fused_op = operand.getDefiningOp()) { + fuse_into = fused_op.getOperation(); + } else { + return failed("input to the fusable op must be a " + + BaseOp::getOperationName() + " or a " + + FusedOp::getOperationName()); + } + + // Operand result must have one use, because we do not want to compute + // tensor contraction twice. + if (!fuse_into->getResult(0).hasOneUse()) { + return failed("fused into op result must have one use"); + } + + MLIRContext *ctx = op->getContext(); + + // Build a fused MatMul operation from a base MatMul and a fusion. + SmallVector locations = {fuse_into->getLoc(), op->getLoc()}; + Location loc = rewriter.getFusedLoc(locations); + + // Fusion can't change the type of a fused operation. + Type result_ty = fuse_into->getResult(0).getType(); + + // Copy all operands from a base op and add additional fusion arguments. + SmallVector operands(fuse_into->getOperands()); + for (int idx : fusion->additional_arguments) { + operands.push_back(op->getOperand(idx)); + } + + // Copy attributes from a base op that we fuse into (e.g. copy all + // MatMul or Conv attributes to the fused operation). + SmallVector attrs(fuse_into->getAttrs().begin(), + fuse_into->getAttrs().end()); + + // Add fusion specific additional attributes. + for (auto attr : fusion->additional_attributes) { + attrs.push_back(attr); + } + + // Add a fused output kernel name to the list of fusions.
+ Identifier fusion_id = Identifier::get("fusion", ctx); + StringAttr fusion_name = StringAttr::get(fusion->output_kernel, ctx); + + auto is_fusion = [&](const NamedAttribute &attr) -> bool { + return attr.first == fusion_id; + }; + + if (isa(fuse_into)) { + NamedAttribute fusion_attr(fusion_id, ArrayAttr::get({fusion_name}, ctx)); + attrs.push_back(fusion_attr); + + } else { + ArrayAttr arr = + llvm::find_if(attrs, is_fusion)->second.template cast(); + llvm::erase_if(attrs, is_fusion); + + auto rng = arr.getAsRange(); + SmallVector updated(rng.begin(), rng.end()); + updated.push_back(fusion_name); + + attrs.push_back(NamedAttribute(fusion_id, ArrayAttr::get(updated, ctx))); + } + + // Update all uses of a fusable op with a new fused operation. + Value fused = rewriter.create(loc, result_ty, operands, attrs); + rewriter.replaceOp(op, {fused}); + + return success(); + } +}; + +// -------------------------------------------------------------------------- // + +using FuseIntoMatMulOp = FuseIntoContractionOp; + +struct ContractionFusionPass + : public PassWrapper { + void runOnFunction() override; +}; + +void ContractionFusionPass::runOnFunction() { + FuncOp func = getFunction(); + + OwningRewritePatternList patterns; + patterns.insert(); + applyPatternsAndFoldGreedily(func, patterns); +} + +} // namespace + +std::unique_ptr> CreateContractionFusionPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-contraction-fusion", + "Fuses operations implementing ContractionFusionInterface into the " + "contraction operations"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc similarity index 74% rename from tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc rename to tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc index 109ceea47e7..d309c6d379f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc @@ -19,7 +19,6 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/DialectHooks.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -35,31 +34,22 @@ namespace { // Since this method is passed to MLIR as decode hook it has to conform // to LLVM style used by MLIR. -bool DecodeOpaqueTensorHook(const OpaqueElementsAttr input, - ElementsAttr& output) { // NOLINT +LogicalResult DecodeOpaqueTensorHook(const OpaqueElementsAttr input, + ElementsAttr& output) { // NOLINT Builder builder(input.getType().getContext()); auto decoded_attr_or = tensorflow::DecodeOpaqueTensor(input, builder); if (!decoded_attr_or.ok()) { VLOG(2) << decoded_attr_or.status().error_message(); - return true; + return failure(); } output = decoded_attr_or.ValueOrDie(); - return false; + return success(); } -// Hooks for the TensorFlow dialect.
-class TensorFlowHooks : public DialectHooks { - public: - DialectConstantFoldHook getConstantFoldHook() { - return TF::ConstantFoldFallbackHook; - } - DialectConstantDecodeHook getDecodeHook() { return DecodeOpaqueTensorHook; } -}; +static bool init_hooks = ([] () { + TF::TensorFlowDialect::RegisterDecodeConstantHook(DecodeOpaqueTensorHook); +}(), true); } // anonymous namespace - -// Static initialization for TensorFlow dialect hooks registration. -static DialectHooksRegistration tf_hooks_registration("tf"); - } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index 40339cebd31..4ed0307e2ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -85,7 +85,7 @@ def DecomposeResourceApplyMomentumOpNonNesterov : $var_resource, $accum_resource, $lr, $grad, $momentum, BoolAttr:$_, ConstBoolAttrFalse:$use_nesterov ), - [(TF_AddOp:$accum_new + [(TF_AddV2Op:$accum_new (TF_MulOp (CreateTFReadVariableOp $src_op, $grad, $accum_resource), $momentum @@ -107,7 +107,7 @@ def DecomposeResourceApplyMomentumOpNesterov : $var_resource, $accum_resource, $lr, $grad, $momentum, BoolAttr:$_, ConstBoolAttrTrue:$use_nesterov ), - [(TF_AddOp:$accum_new + [(TF_AddV2Op:$accum_new (TF_MulOp (CreateTFReadVariableOp $src_op, $grad, $accum_resource), $momentum @@ -117,7 +117,7 @@ def DecomposeResourceApplyMomentumOpNesterov : (TF_AssignVariableOp $accum_resource, $accum_new), (TF_AssignSubVariableOp $var_resource, - (TF_AddOp + (TF_AddV2Op (TF_MulOp $grad, $lr), (TF_MulOp $accum_new, (TF_MulOp $momentum, $lr)) ) @@ -175,7 +175,7 @@ def DecomposeResourceApplyKerasMomentumOpNesterov : ] >; -// Pattern to Decompose ResourceApplyAdagrad. +// Pattern to Decompose ResourceApplyAdagradV2. // This decomposition is only correct inside XLA as it ignores use_locking // attribute. // accum <- accum + grad * grad @@ -201,6 +201,21 @@ def DecomposeResourceApplyAdagradV2 : ] >; +// ResourceApplyAdagrad op can be canonicalized to ResourceApplyAdagradV2 with +// zero epsilon and then decomposed using DecomposeResourceApplyAdagradV2 +// pattern. +def DecomposeResourceApplyAdagrad : + Pattern< + (TF_ResourceApplyAdagradOp $var_resource, $accum_resource, $lr, $grad, + $use_locking, $update_slots), + [ + (TF_ConstOp:$zero_epsilon (GetScalarOfType<0> $grad)), + (TF_ResourceApplyAdagradV2Op $var_resource, $accum_resource, $lr, + $zero_epsilon, $grad, $use_locking, $update_slots + ) + ]>; + + // Pattern to Decompose ResourceApplyAdam without Nesterov momentum. // This decomposition is only correct inside XLA as it ignores use_locking // attribute. 
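Editor's note on the DecomposeResourceApplyAdagrad pattern added above: a brief sketch, in the same update-rule comment notation the surrounding patterns use, of what the canonicalization composes to. The placement of epsilon below follows the DecomposeResourceApplyAdagradV2 pattern it defers to and should be read as an assumption to verify against that pattern, not a definitive statement.

// ResourceApplyAdagrad(var, accum, lr, grad)
//   == ResourceApplyAdagradV2(var, accum, lr, /*epsilon=*/0, grad)
// which DecomposeResourceApplyAdagradV2 then expands to roughly:
//   accum <- accum + grad * grad
//   var   <- var - lr * grad / (sqrt(accum) + epsilon), with epsilon == 0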
@@ -342,7 +357,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), [(TF_ConstOp:$one (GetScalarOfType<1> $grad)), (CreateTFReadVariableOp $src_op, $grad, $ms_resource), - (TF_AddOp:$ms_new + (TF_AddV2Op:$ms_new (TF_MulOp (TF_MulOp $grad, $grad), (TF_SubOp $one, $rho) @@ -354,7 +369,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), (TF_AssignVariableOp $ms_resource, $ms_new), // mg = grad * (one - rho) + mg * rho; - (TF_AddOp:$mg_new + (TF_AddV2Op:$mg_new (TF_MulOp $grad, (TF_SubOp $one, $rho) @@ -366,7 +381,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), (TF_AssignVariableOp $mg_resource, $mg_new), // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) - (TF_AddOp:$mom_new + (TF_AddV2Op:$mom_new (TF_MulOp $momentum, (CreateTFReadVariableOp $src_op, $grad, $mom_resource)), (TF_DivOp @@ -374,7 +389,7 @@ def DecomposeResourceApplyCenteredRMSProp : (TF_SqrtOp (TF_SubOp $ms_new, - (TF_AddOp + (TF_AddV2Op (TF_MulOp $mg_new, $mg_new @@ -390,3 +405,45 @@ def DecomposeResourceApplyCenteredRMSProp : (TF_AssignSubVariableOp $var_resource, $mom_new) ] >; + +// This decomposition is only correct inside XLA as it ignores use_locking +// attribute. +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +// var <- var - mom +def DecomposeResourceApplyRMSProp : + Pattern< + (TF_ResourceApplyRMSPropOp:$src_op + $var_resource, $ms_resource, $mom_resource, $lr, $rho, $momentum, $epsilon, + $grad, ConstBoolAttrFalse:$use_locking + ), + [(TF_ConstOp:$one (GetScalarOfType<1> $grad)), + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + // ms <- rho * ms_{t-1} + (1-rho) * grad * grad + (TF_AddV2Op:$ms_new + (TF_MulOp + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + $rho + ), + (TF_MulOp + (TF_SquareOp $grad), + (TF_SubOp $one, $rho) + ) + ), + (TF_AssignVariableOp $ms_resource, $ms_new), + // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) + (TF_AddV2Op:$mom_new + (TF_MulOp $momentum, + (CreateTFReadVariableOp $src_op, $grad, $mom_resource)), + (TF_DivOp + (TF_MulOp $lr, $grad), + (TF_SqrtOp + (TF_AddV2Op $ms_new, $epsilon) + ) + ) + ), + (TF_AssignVariableOp $mom_resource, $mom_new), + // var <- var - mom + (TF_AssignSubVariableOp $var_resource, $mom_new) + ] + >; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc index b47378762a9..cc24c98a786 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc @@ -240,7 +240,7 @@ static LogicalResult FoldMergeNodes(FuncOp function, const DeadQueue& queue) { auto def_op = val.getDefiningOp(); #ifndef NDEBUG auto exec_dialect = - function.getContext()->getRegisteredDialect("tf_executor"); + function.getContext()->getLoadedDialect("tf_executor"); assert(def_op->getDialect() == exec_dialect && "unable to forward control dependencies"); #endif diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index d8678e620f4..a5d76619416 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -157,14 +157,14 @@ static LogicalResult LowerIfOp(IfOp op) { // Set up the 'then' block. 
Block* then_block = builder.createBlock(merge_block); - Operation* call_op = CallFn(loc, get_operand, op.then_func(), &builder); + Operation* call_op = CallFn(loc, get_operand, op.then_function(), &builder); auto get_then_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_then_result, merge_block, &builder); // Set up the 'else' block. Block* else_block = builder.createBlock(merge_block); - call_op = CallFn(loc, get_operand, op.else_func(), &builder); + call_op = CallFn(loc, get_operand, op.else_function(), &builder); auto get_else_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_else_result, merge_block, &builder); @@ -190,8 +190,8 @@ static LogicalResult LowerWhileOp(WhileOp op) { OpBuilder builder(op_inst); - auto cond_fn = op.cond_func(); - auto body_fn = op.body_func(); + auto cond_fn = op.cond_function(); + auto body_fn = op.body_function(); // Split the block containing the While op into two blocks. One containing // operations before the While op and other containing the rest. Create two diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index d23b977f0e3..87733bbbf3f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project @@ -31,8 +32,8 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #define DEBUG_TYPE "tf-functional-cf-to-region" @@ -53,8 +54,8 @@ struct FunctionalControlFlowToRegions // the input arguments are used as is (for IfOp) or block arguments of the same // type as the input arguments are created and then used as call arguments (for // While). -void CreateCall(Operation* op, FuncOp func, Region& caller_region, - ValueRange args, bool use_region_args) { +YieldOp CreateCall(Operation* op, FuncOp func, Region& caller_region, + ValueRange args, bool use_region_args) { assert(caller_region.empty() && "Expected empty region for newly created ops"); OpBuilder builder(caller_region); @@ -76,20 +77,31 @@ void CreateCall(Operation* op, FuncOp func, Region& caller_region, casted_args.push_back(arg); } auto call = builder.create(op->getLoc(), func, casted_args); - builder.create(op->getLoc(), call.getResults()); + return builder.create(op->getLoc(), call.getResults()); +} + +// Converts the condition for an IfOp/WhileOp to a boolean value. 
+Value ConvertConditionToBoolean(Operation* op, Value cond) { + if (auto ranked_type = cond.getType().dyn_cast()) + if (ranked_type.getRank() == 0 && + ranked_type.getElementType().isSignlessInteger(1)) + return cond; + + OpBuilder builder(op); + return builder.create(op->getLoc(), cond); } // Transform a functional IfOp to a region based IfRegionOp. LogicalResult ConvertIfOp(IfOp if_op) { + Value cond = ConvertConditionToBoolean(if_op, if_op.cond()); auto if_region = OpBuilder(if_op).create( - if_op.getLoc(), if_op.getResultTypes(), if_op.cond(), - if_op.is_stateless()); - CopyUnderscoredAttributes(if_op, if_region); + if_op.getLoc(), if_op.getResultTypes(), cond, if_op.is_stateless()); + CopyDeviceAndUnderscoredAttributes(if_op, if_region); - CreateCall(if_op, if_op.then_func(), + CreateCall(if_op, if_op.then_function(), /*caller_region=*/if_region.then_branch(), if_op.input(), /*use_region_args=*/false); - CreateCall(if_op, if_op.else_func(), + CreateCall(if_op, if_op.else_function(), /*caller_region=*/if_region.else_branch(), if_op.input(), /*use_region_args=*/false); if_op.replaceAllUsesWith(if_region.getResults()); @@ -101,12 +113,17 @@ LogicalResult ConvertWhileOp(WhileOp while_op) { auto while_region = OpBuilder(while_op).create( while_op.getLoc(), while_op.getResultTypes(), while_op.input(), while_op.is_stateless(), while_op.parallel_iterations()); - CopyUnderscoredAttributes(while_op, while_region); + CopyDeviceAndUnderscoredAttributes(while_op, while_region); - CreateCall(while_op, while_op.cond_func(), - /*caller_region=*/while_region.cond(), while_op.input(), - /*use_region_args=*/true); - CreateCall(while_op, while_op.body_func(), + YieldOp cond_yield = + CreateCall(while_op, while_op.cond_function(), + /*caller_region=*/while_region.cond(), while_op.input(), + /*use_region_args=*/true); + Value i1_cond = + ConvertConditionToBoolean(cond_yield, cond_yield.getOperand(0)); + cond_yield.setOperand(0, i1_cond); + + CreateCall(while_op, while_op.body_function(), /*caller_region=*/while_region.body(), while_op.input(), /*use_region_args=*/true); while_op.replaceAllUsesWith(while_region.getResults()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc index 175baeb627f..fbe0524ce8b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc @@ -91,7 +91,7 @@ struct ReluToFusedBatchNorm : public OpRewritePattern { // Build the newly fused operation to replace the batch norm OperationState state(batch_norm.getLoc(), - FusedBatchNormExOp::getOperationName()); + _FusedBatchNormExOp::getOperationName()); state.addOperands(batch_norm.getOperands()); if (side_input) state.operands.push_back(side_input); state.addTypes(batch_norm.getResultTypes()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc index bce18c0b4b7..4e507c8e760 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc @@ -104,10 +104,10 @@ LogicalResult HoistOpsAndAnnotateWithDevice(const Dialect* tf_dialect, } void LaunchToDeviceAttributePass::runOnFunction() { - const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); if (!tf_dialect) { - 
signalPassFailure(); getFunction().emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); } auto result = getFunction().walk([&](tf_device::LaunchOp launch) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index e76a8da0b29..8123f50757e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -33,6 +35,34 @@ namespace mlir { namespace TF { namespace { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" + +// Helper method that returns an op from 'transpose_ops' that match criteria +// for an 'operand' and 'permutation' +TransposeOp ReuseExistingTranspose(const OpOperand* operand, + const SmallVector& permutation, + Operation* op, ConstOp permutation_op, + SmallVector* transpose_ops) { + for (auto it = transpose_ops->begin(); it != transpose_ops->end(); ++it) { + auto tranpose_op = *it; + for (auto tranpose_operand : tranpose_op.getOperands()) { + auto ranked_tranpose_type = + tranpose_operand.getType().dyn_cast_or_null(); + if (!ranked_tranpose_type) continue; + if (ranked_tranpose_type.getRank() == permutation.size() && + operand->get().getType() == + ShuffleRankedTensorType(ranked_tranpose_type, permutation)) { + TransposeOp transpose = tranpose_op; + transpose.getOperation()->moveBefore(op); + transpose.setOperand(0, operand->get()); + transpose.setOperand(1, permutation_op); + transpose_ops->erase(it); + return transpose; + } + } + } + return nullptr; +} // LayoutAssignmentPass assigns optimal data layout (data format) for all // layout sensitive operations. @@ -79,18 +109,7 @@ class MoveTransposesPass clEnumValN(Direction::kEnd, "end", "end of the block"))}; }; -using Permutation = SmallVector; - -Permutation GetDataFormatPermutation(StringRef from_data_format, - StringRef to_data_format) { - if (from_data_format == "NHWC" && to_data_format == "NCHW") { - return {0, 3, 1, 2}; - } else if (from_data_format == "NCHW" && to_data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - llvm_unreachable("Unknown data format combination"); - } -} +using Permutation = SmallVector; void LayoutAssignmentPass::runOnFunction() { FuncOp func = getFunction(); @@ -131,7 +150,7 @@ void LayoutAssignmentPass::runOnFunction() { OpBuilder builder = OpBuilder::atBlockEnd(op->getBlock()); auto perm_attr = [&](Permutation permutation) -> DenseIntElementsAttr { - auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(32)); + auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(64)); return DenseIntElementsAttr::get(perm_ty, permutation); }; @@ -202,6 +221,27 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { // Nothing to do here. 
if (!permutation_op || transpose_ops.empty()) return; + SmallVector permutation; + auto perm_attr = permutation_op.value().cast(); + for (const auto& value : perm_attr.getIntValues()) + permutation.push_back(value.getSExtValue()); + + // We want to make sure the shape of the operand equals the transposed shape. + // mismatch can happen if 'op' supports broadcasting and the operands have + // different ranks. + if (op->hasTrait()) { + auto transpose_op = *transpose_ops.begin(); + auto result_type = + transpose_op.getResult().getType().dyn_cast_or_null(); + auto is_valid_move = + llvm::all_of(op->getOperands(), [result_type](Value operand) -> bool { + auto operand_type = operand.getType().dyn_cast_or_null(); + return result_type && operand_type && result_type.hasRank() && + operand_type.hasRank() && + result_type.getRank() == operand_type.getRank(); + }); + if (!is_valid_move) return; + } // At this point we checked that we can safely move Transpose node before // `op`, and bypass all result transposes. @@ -228,16 +268,12 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { work_list->push_back(operand_op); // Try to reuse result transposes. - TransposeOp transpose; - if (!transpose_ops.empty()) { - transpose = transpose_ops.pop_back_val(); - transpose.getOperation()->moveBefore(op); - transpose.setOperand(0, operand.get()); - transpose.setOperand(1, permutation_op); - } else { + TransposeOp transpose = ReuseExistingTranspose( + &operand, permutation, op, permutation_op, &transpose_ops); + // If no transpose available for using, create new one. + if (!transpose) transpose = builder.create(loc, operand.get(), permutation_op); - } operand.set(transpose); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index ad241ef9488..8e93a7e7470 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -88,7 +88,7 @@ class ConvertConvOp : public OpConversionPattern { const int input_channels = conv_op.lhs().getType().cast().getDimSize( input_feature_dimension); - int feature_group_count = conv_op.feature_group_count().getSExtValue(); + int feature_group_count = conv_op.feature_group_count(); const bool is_depthwise_conv = input_channels == feature_group_count; std::string padding; @@ -250,7 +250,7 @@ class ConvertSliceOp : public OpConversionPattern { strides.getSplatValue().cast().getInt() != 1) return failure(); - rewriter.setInsertionPointAfter(slice_op); + rewriter.setInsertionPointAfter(slice_op.getOperation()); auto start_indices = slice_op.start_indices(); auto limit_indices = slice_op.limit_indices(); std::vector size_values; @@ -614,7 +614,65 @@ class ConvertReduceOpToTfMin : public OpConversionPattern { }; }; +class ConvertIotaOpToTfRange : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::IotaOp iota_op, ArrayRef args, + ConversionPatternRewriter &rewriter) const final { + RankedTensorType type = + iota_op.getType().dyn_cast_or_null(); + if (!type) return failure(); + + const uint64_t dimension = iota_op.iota_dimension(); + Type element_type = type.getElementType(); + Attribute start, limit, delta; + if (element_type.isa()) { + start = rewriter.getFloatAttr(element_type, 0.0); + limit = rewriter.getFloatAttr(element_type, type.getShape()[dimension]); + delta = rewriter.getFloatAttr(element_type, 1.0); + } else if 
(element_type.isa()) { + start = rewriter.getIntegerAttr(element_type, 0); + limit = rewriter.getIntegerAttr(element_type, type.getShape()[dimension]); + delta = rewriter.getIntegerAttr(element_type, 1); + } else { + return failure(); + } + + auto range_type = + RankedTensorType::get({type.getShape()[dimension]}, element_type); + Value start_op = rewriter.create(iota_op.getLoc(), start); + Value limit_op = rewriter.create(iota_op.getLoc(), limit); + Value delta_op = rewriter.create(iota_op.getLoc(), delta); + Value result = rewriter.create(iota_op.getLoc(), range_type, + start_op, limit_op, delta_op); + + if (type.getRank() > 1) { + std::vector reshape_shape(type.getRank(), 1); + reshape_shape[iota_op.iota_dimension()] = type.getShape()[dimension]; + auto reshape_type = RankedTensorType::get(reshape_shape, element_type); + Value reshape_shape_op = rewriter.create( + iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape)); + result = rewriter.create(iota_op.getLoc(), reshape_type, + result, reshape_shape_op); + + Value broadcast_shape_op = rewriter.create( + iota_op.getLoc(), rewriter.getI64TensorAttr(type.getShape())); + result = rewriter.create(iota_op.getLoc(), type, + result, broadcast_shape_op); + } + + rewriter.replaceOp(iota_op, result); + return success(); + } +}; + class LegalizeHloToTf : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: LegalizeHloToTf() = default; LegalizeHloToTf(const LegalizeHloToTf &) {} @@ -765,7 +823,8 @@ void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList *patterns, MLIRContext *context) { populateWithGenerated(context, patterns); patterns->insert(context); + ConvertReduceOpToTfMin, ConvertReduceOpToTfSum, + ConvertIotaOpToTfRange>(context); } std::unique_ptr> CreateLegalizeHloToTfPass() { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index d67739a739b..f88488de27d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/util/tensor_format.h" @@ -55,18 +56,27 @@ static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, return DenseIntElementsAttr::get(ty, vals); } -// Returns int or float DenseElementsAttr with scalar shape with the given -// element type and the integer value. +// Returns int, float, or complex DenseElementsAttr with scalar shape with the +// given element type and the integer value. 
static DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value) { RankedTensorType scalar_ty = RankedTensorType::get({}, ty); if (auto float_ty = ty.dyn_cast_or_null()) { FloatAttr attr = FloatAttr::get(float_ty, raw_value); return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto int_ty = ty.dyn_cast_or_null()) { + IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto complex_ty = ty.dyn_cast_or_null()) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } } - - auto int_ty = ty.cast(); - IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); - return DenseElementsAttr::get(scalar_ty, attr); + llvm_unreachable("unsupported type"); } // Returns float DenseElementsAttr with scalar shape with the specified value. @@ -111,34 +121,87 @@ Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { return RankedTensorType::get(shape, ranked_ty.getElementType()); } +// Converts individual Values to a tensor of rank 1. Each input Value has rank 1 +// and size 1. +Value ValuesToRank1(PatternRewriter &rewriter, Location loc, Type dtype, + ArrayRef vals) { + int64_t length = vals.size(); + auto type = RankedTensorType::get({length}, dtype); + auto axis = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 0)); + return rewriter.create(loc, type, ValueRange(vals), axis); +} + // Lowers AddN op to a sequence of AddV2 ops to accumulate operands. // +// Note that to improve the parallelism, AddN op uses tree-based reduction. +// For example, tf.AddN([0, 1, 2, 3, 4]) behaves as follows: +// +// 0 1 2 3 4 +// | | | | | +// ------- ------- | +// | | | +// 5 6 | +// | | | +// ------------- | +// | | +// 7 | +// | | +// ---------------- +// | +// 8 +// +// Example: +// // %result = "tf.AddN"(%0, %1, %2) // // is lowered to: // -// %sum_0 = "tf.AddV2"(%0, %1) -// %result = "tf.AddV2"(%sum_0, %2) +// %sum0 = "tf.AddV2"(%0, %1) +// %result = "tf.AddV2"(%sum0, %2) // -class LowerAddNOp : public OpRewritePattern { +// While +// +// %result = "tf.AddN"(%0, %1, %2, %3, %4) +// +// is lowered to: +// +// %sum0 = "tf.AddV2"(%0, %1) +// %sum1 = "tf.AddV2"(%2, %3) +// %sum2 = "tf.AddV2"(%sum0, %sum1) +// %result = "tf.AddV2"(%sum2, %4) +// +class LowerAddNOp : public RewritePattern { public: explicit LowerAddNOp(MLIRContext *context) - : OpRewritePattern(context) {} + : RewritePattern(TF::AddNOp::getOperationName(), + {TF::AddV2Op::getOperationName()}, 1, context) {} - LogicalResult matchAndRewrite(TF::AddNOp op, + LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { + auto addn_op = cast(op); + // TODO(hinsu): Support variant with TensorList type. tf.AddV2 doesn't // support variant type so variant types require special handling. - if (getElementTypeOrSelf(op.getType()).isa()) return failure(); + if (getElementTypeOrSelf(addn_op.getType()).isa()) + return failure(); + llvm::SmallVector operands(addn_op.inputs().begin(), + addn_op.inputs().end()); - // TODO(hinsu): Improve parallelism by splitting operands in two halves and - // accumulating them first. 
- Value result = *op.inputs().begin(); - for (Value operand : llvm::drop_begin(op.inputs(), 1)) { - result = rewriter.create(op.getLoc(), result, operand); + int64_t n = operands.size(); + // Keep doing tree-based reduction when there are more than one operand. + while (n > 1) { + for (int64_t i = 0; i < n; i += 2) { + // Add two adjacent operands if applicable. + operands[i / 2] = + (i + 1 < n) ? rewriter.create( + addn_op.getLoc(), operands[i], operands[i + 1]) + : operands[i]; + } + n = (n + 1) / 2; } - rewriter.replaceOp(op, result); + rewriter.replaceOp(addn_op, operands[0]); return success(); } }; @@ -224,7 +287,7 @@ class LowerDynamicStitchOp : public OpRewritePattern { reshaped_data.getType().cast().getShape()[0]; auto items = rewriter.create( loc, SmallVector(num_items, item_ty), reshaped_data, - /*axis=*/APInt(64, 0)); + /*axis=*/0); for (auto index_item : llvm::zip(index_attr, items.getResults())) { int64_t output_index = std::get<0>(index_item).getSExtValue(); Value item = std::get<1>(index_item); @@ -320,7 +383,7 @@ class LowerPackOp : public OpRewritePattern { loc, DenseElementsAttr::get( RankedTensorType::get({}, rewriter.getIntegerType(64)), op.axis())); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); Type prev_input_ty, inferred_ty; SmallVector expanded_inputs; @@ -344,6 +407,187 @@ class LowerPackOp : public OpRewritePattern { } }; +// Lowers SpaceToBatchND by reducing to reshape(transpose(reshape(pad(input)))). +// +// Before rewrite: +// output = SpaceToBatchND(input, block_shape, paddings) +// Let: +// [batch] + spatial_shape + remaining_shape = input.shape +// M = spatial_shape.rank +// After rewrite: +// padded = zero-pad input with paddings +// The spatial_shape component of input.shape pads with paddings[*, 0] +// before each dimension, and paddings[*, 1] after each dimension. 
+// reshaped = reshape padded to: +// [batch] +// + [padded.shape[1]/block_shape[0], block_shape[0], ..., +// padded.shape[M]/block_shape[M-1], block_shape[M-1]] +// + remaining_shape +// permuted = transpose reshaped to: +// block_shape +// + [batch] +// + [padded.shape[1]/block_shape[0], ..., padded.shape[M]/block_shape[M-1]] +// + remaining_shape +// result = reshape permuted to: +// [batch * product(block_shape)] +// + [padded.shape[1]/block_shape[0], ..., padded.shape[M]/block_shape[M-1]] +// + remaining_shape +class LowerSpaceToBatchNDOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::SpaceToBatchNDOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + auto input_type = op.input().getType().cast(); + if (!input_type.hasStaticShape()) { + return failure(); + } + ArrayRef input_shape = input_type.getShape(); + auto block_shape_type = op.block_shape().getType().cast(); + if (!block_shape_type.hasStaticShape()) { + return failure(); + } + auto paddings_type = op.paddings().getType().cast(); + + int64_t input_rank = input_type.getRank(); + int64_t block_rank = block_shape_type.getNumElements(); + int64_t remaining_rank = input_rank - 1 - block_rank; + if (remaining_rank < 0) { + // TODO(b/157475606): Move this check to ::Verify + return failure(); + } + + auto block_shape_i64_type = RankedTensorType::get( + block_shape_type.getShape(), rewriter.getIntegerType(64)); + auto block_shape_i64 = rewriter.create( + loc, block_shape_i64_type, op.block_shape()); + + auto paddings_i64_type = RankedTensorType::get(paddings_type.getShape(), + rewriter.getIntegerType(64)); + auto paddings_i64 = + rewriter.create(loc, paddings_i64_type, op.paddings()); + + auto pad00 = rewriter.create( + loc, DenseElementsAttr::get( + RankedTensorType::get({1, 2}, rewriter.getIntegerType(64)), + {0, 0})); + SmallVector full_paddings_list{pad00, paddings_i64}; + full_paddings_list.append(remaining_rank, pad00); + auto full_paddings_type = + RankedTensorType::get({input_rank, 2}, rewriter.getIntegerType(64)); + auto zero_i64 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 0)); + // Extends paddings to all dimensions of input by adding 0s to non-block + // dimensions. + auto full_paddings = rewriter.create( + loc, full_paddings_type, full_paddings_list, zero_i64); + + SmallVector padded_shape(input_rank, ShapedType::kDynamicSize); + auto padded_type = + RankedTensorType::get(padded_shape, rewriter.getF32Type()); + // padded = pad(input, full_paddings) + auto padded = + rewriter.create(loc, padded_type, op.input(), full_paddings); + + auto paddings_sum_type = + RankedTensorType::get({input_rank}, rewriter.getIntegerType(64)); + auto one_i64 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 1)); + // paddings_sum = paddings[*,0] + paddings[*,1] + auto paddings_sum = rewriter.create(loc, paddings_sum_type, + full_paddings, one_i64); + + // input_shape_tensor = input.shape + auto input_shape_tensor = rewriter.create( + loc, + DenseElementsAttr::get( + RankedTensorType::get({input_rank}, rewriter.getIntegerType(64)), + input_shape)); + + // padded_shape_tensor is the shape of padded. 
+ auto padded_shape_tensor = + rewriter.create(loc, paddings_sum, input_shape_tensor); + + auto zero_i32 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(32), 0)); + SmallVector padded_shape_splits_types( + input_rank, RankedTensorType::get({1}, rewriter.getIntegerType(64))); + SmallVector padded_shape_splits( + rewriter + .create(loc, padded_shape_splits_types, zero_i32, + padded_shape_tensor) + .output()); + + SmallVector block_shape_splits_types( + block_rank, RankedTensorType::get({1}, rewriter.getIntegerType(64))); + SmallVector block_shape_splits( + rewriter + .create(loc, block_shape_splits_types, zero_i32, + block_shape_i64) + .output()); + + SmallVector outer_shape_vals; + for (int64_t i = 0; i < block_rank; ++i) { + // TODO(b/157475606): Insert tf.Assert that the following division has + // remainder 0. + outer_shape_vals.push_back(rewriter.create( + loc, padded_shape_splits[1 + i], block_shape_splits[i])); + } + + SmallVector reshaped_shape_vals{padded_shape_splits[0]}; + for (int64_t i = 0; i < block_rank; ++i) { + reshaped_shape_vals.push_back(outer_shape_vals[i]); + reshaped_shape_vals.push_back(block_shape_splits[i]); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + reshaped_shape_vals.push_back(padded_shape_splits[i]); + } + auto reshaped_shape = ValuesToRank1( + rewriter, loc, rewriter.getIntegerType(64), reshaped_shape_vals); + + SmallVector permutation_vals; + for (int64_t i = 0; i < block_rank; ++i) { + permutation_vals.push_back(2 + 2 * i); + } + permutation_vals.push_back(0); + for (int64_t i = 0; i < block_rank; ++i) { + permutation_vals.push_back(1 + 2 * i); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + permutation_vals.push_back(block_rank + i); + } + auto permutation = rewriter.create( + loc, GetI64ElementsAttr(permutation_vals, &rewriter)); + + auto output_batch = padded_shape_splits[0]; + for (int64_t i = 0; i < block_rank; ++i) { + output_batch = + rewriter.create(loc, output_batch, block_shape_splits[i]); + } + SmallVector output_shape_vals{output_batch}; + for (int64_t i = 0; i < block_rank; ++i) { + output_shape_vals.push_back(outer_shape_vals[i]); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + output_shape_vals.push_back(padded_shape_splits[i]); + } + auto output_shape = ValuesToRank1( + rewriter, loc, rewriter.getIntegerType(64), output_shape_vals); + auto reshaped = rewriter.create(loc, padded, reshaped_shape); + auto permuted = + rewriter.create(loc, reshaped, permutation); + + // Sometimes the result type is more specific than what the reshape builder + // can infer. + auto result_type = op.getResult().getType(); + rewriter.replaceOpWithNewOp(op, result_type, permuted, + output_shape); + + return success(); + } +}; + // Lowers `TF::SparseMatMulOp` to `TF::MatMulOp`, ignoring the sparseness hints, // since we currently don't have an implementation that can use this // information. Adds appropriate casts where necessary to align element types @@ -388,12 +632,37 @@ class LowerSparseMatMulOp : public OpRewritePattern { } }; +// Lowers _UnaryOpsComposition op as a series of original TensorFlow ops that +// were fused together. +class Lower_UnaryOpsComposition + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::_UnaryOpsCompositionOp op, + PatternRewriter &rewriter) const override { + Value result = op.x(); + for (StringRef op_name : op.op_names().getAsValueRange()) { + std::string full_name = "tf." 
+ op_name.str(); + // All ops in the sequences have the same result type as the original + // result type. + OperationState state(op.getLoc(), full_name, /*operands=*/{result}, + /*types=*/{op.getType()}, /*attributes=*/{}); + Operation *op = rewriter.createOperation(state); + result = op->getResult(0); + } + rewriter.replaceOp(op, {result}); + return success(); + } +}; + } // namespace void PopulateLoweringTFPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { patterns->insert(context); + LowerPackOp, LowerSpaceToBatchNDOp, LowerSparseMatMulOp, + Lower_UnaryOpsComposition>(context); populateWithGenerated(context, patterns); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index 6b7d7178ab6..f7a867f3130 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -195,8 +195,7 @@ def : Pat<(TF_PadOp TensorOf<[AnySignlessInteger, AnyFloat]>:$input, $paddings), // Reciprocal op patterns. //===----------------------------------------------------------------------===// -// TODO(hinsu): Support complex and unsigned input types. -def LowerReciprocal : Pat<(TF_ReciprocalOp TF_SintOrFpTensor:$x), +def LowerReciprocal : Pat<(TF_ReciprocalOp $x), (TF_DivOp (TF_ConstOp (GetScalarOfType<1> $x)), $x)>; //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc index 72f7a3a438c..25bd53ee73c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc @@ -17,12 +17,17 @@ limitations under the License. #include #include +#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" namespace mlir { namespace TFDevice { @@ -30,6 +35,7 @@ namespace TFDevice { namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; +constexpr char kAllowSoftPlacementAttr[] = "allow_soft_placement"; // This pass marks unsupported ops in a device cluster with // `_xla_outside_compilation` attribute so the operations will run on the host @@ -41,6 +47,36 @@ struct MarkOpsForOutsideCompilation void runOnOperation() override; }; +// Adds any canonicalization patterns to list of supported `patterns`. +// TODO(b/161726307): Move or import the relevant patterns to LowerTF pass and +// remove this. +void AddCanonicalizationPatterns(MLIRContext* context, + OwningRewritePatternList* patterns) { + for (auto* op : context->getRegisteredOperations()) + op->getCanonicalizationPatterns(*patterns, context); +} + +// TODO(b/159128666): Check the control flow legalization passes instead once +// added. 
+void AddSupportedControlFlowOps(MLIRContext* context, + llvm::DenseSet* supported_ops) { + supported_ops->insert( + OperationName(TF::IfRegionOp::getOperationName(), context)); + supported_ops->insert( + OperationName(TF::WhileRegionOp::getOperationName(), context)); + supported_ops->insert( + OperationName(TF::YieldOp::getOperationName(), context)); +} + +// These embedding ops are rewritten when running TPUCompileOp. +void AddRewrittenEmbeddingOps(MLIRContext* context, + llvm::DenseSet* supported_ops) { + supported_ops->insert(OperationName( + TF::RecvTPUEmbeddingActivationsOp::getOperationName(), context)); + supported_ops->insert(OperationName( + TF::SendTPUEmbeddingGradientsOp::getOperationName(), context)); +} + bool HasStringOperand(Operation& op) { for (auto operand : op.getOperands()) { if (getElementTypeOrSelf(operand).isa()) return true; @@ -55,29 +91,143 @@ bool HasStringResult(Operation& op) { return false; } -// Checks if the op is supported inside of a device cluster. -bool IsSupportedOp(Operation& op) { - if (HasStringOperand(op) || HasStringResult(op)) { - return false; - } - return true; +bool MatchesPattern(Operation& op, + const llvm::DenseSet& supported_ops) { + return (supported_ops.contains(op.getName())); } -LogicalResult MarkUncompilableOps(Block* block) { - for (Operation& op : *block) { - if (!IsSupportedOp(op)) { - op.setAttr(kXlaOutsideCompilationAttr, - StringAttr::get("auto", op.getContext())); - } +// Checks if the op is supported inside of a device cluster. Ops not +// in `tf_dialect` are considered supported. +bool IsSupportedOp(Operation& op, + const llvm::DenseSet& supported_ops, + const Dialect* tf_dialect) { + if (op.getDialect() != tf_dialect) + return true; + else + return !HasStringOperand(op) && !HasStringResult(op) && + (MatchesPattern(op, supported_ops) || + mhlo::IsOpAllowedTf2XlaFallback(&op)); +} + +// Checks all regions of `op` for captured string operands. +bool HasCapturedStringOperand(Operation* op) { + bool string_operand = false; + for (auto& region : op->getRegions()) { + mlir::visitUsedValuesDefinedAbove( + region, region, [&](mlir::OpOperand* operand) { + if (getElementTypeOrSelf(operand->get()).isa()) + string_operand = true; + }); + if (string_operand) return string_operand; } + return string_operand; +} + +// Marks uncompilable ops that are in `tf_dialect` for outside compilation. +LogicalResult MarkUncompilableOps( + const Dialect* tf_dialect, Block* block, + llvm::DenseSet& supported_ops) { + // Automatically marked ops for outside compilation have + // `_xla_outside_compilation` attribute value of "auto" plus + // an increasing counter. Manually marked ops for outside compilation only + // have an increasing counteri for the attribute value. Therefore there is no + // collision in + // `_xla_outside_compilation` attribute between automatically and manually + // marking ops. 
+ int outside_compiled_cluster_counter = 0; + block->walk([&](Operation* op) { + if (!IsSupportedOp(*op, supported_ops, tf_dialect)) { + op->setAttr( + kXlaOutsideCompilationAttr, + StringAttr::get( + llvm::formatv("auto{0}", outside_compiled_cluster_counter).str(), + op->getContext())); + outside_compiled_cluster_counter++; + } + if (llvm::isa(op)) { + if (HasCapturedStringOperand(op)) { + op->setAttr( + kXlaOutsideCompilationAttr, + StringAttr::get( + llvm::formatv("auto{0}", outside_compiled_cluster_counter) + .str(), + op->getContext())); + outside_compiled_cluster_counter++; + } + } + }); return success(); } +// Unmarks outside compilation for any op that has parents already +// marked for outside compilation since the child will be extracted +// anyways. +void UnmarkChildren(Block* block) { + block->walk([&](Operation* op) { + if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) return; + Operation* iter_op = op; + bool remove_attr = false; + while (auto* parent_op = iter_op->getParentOp()) { + if (parent_op->getAttrOfType(kXlaOutsideCompilationAttr)) { + remove_attr = true; + break; + } + iter_op = parent_op; + } + if (remove_attr) op->removeAttr(kXlaOutsideCompilationAttr); + }); +} + void MarkOpsForOutsideCompilation::runOnOperation() { auto module = getOperation(); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); + if (!tf_dialect) { + getOperation().emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); + } + OwningRewritePatternList patterns; + mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); + TF::PopulateLoweringTFPatterns(module.getContext(), &patterns); + AddCanonicalizationPatterns(module.getContext(), &patterns); + + // `supported_ops` contains the name of all of the ops that can potentially be + // lowered into HLO on the device. This doesn't always mean that the op can + // be lowered in the future passes but if the op is not in this set, it can't + // be lowered in a subsequent pass. + llvm::DenseSet supported_ops; + for (auto& pattern : patterns) { + Optional root_kind = pattern->getRootKind(); + if (root_kind.hasValue()) supported_ops.insert(root_kind.getValue()); + } + AddSupportedControlFlowOps(module.getContext(), &supported_ops); + AddRewrittenEmbeddingOps(module.getContext(), &supported_ops); + + auto result = module.walk([&](tf_device::ClusterOp cluster) { + // Only if `allow_soft_placement` attribute is true should we mark ops + // for outside compilation. + auto soft_placement_attr = + cluster.getAttrOfType(kAllowSoftPlacementAttr); + if (!(soft_placement_attr && soft_placement_attr.getValue())) { + return WalkResult::advance(); + } + if (failed( + MarkUncompilableOps(tf_dialect, &cluster.GetBody(), supported_ops))) + return WalkResult::interrupt(); + + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) return signalPassFailure(); module.walk([&](tf_device::ClusterOp cluster) { - MarkUncompilableOps(&cluster.GetBody()); + // Only if `allow_soft_placement` attribute is true should we unmark ops + // for outside compilation. 
+ auto soft_placement_attr = + cluster.getAttrOfType(kAllowSoftPlacementAttr); + if (!(soft_placement_attr && soft_placement_attr.getValue())) { + return; + } + UnmarkChildren(&cluster.GetBody()); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 6fee693554e..b81e390580d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -109,13 +109,14 @@ class ResourceAnalyzer { return; } if (auto if_op = dyn_cast(op)) { - for (auto callee : {if_op.then_func(), if_op.else_func()}) { + for (auto callee : {if_op.then_function(), if_op.else_function()}) { PropagatePotentiallyWrittenUpFromCallee(callee, if_op.input()); } return; } if (auto while_op = dyn_cast(op)) { - for (auto callee : {while_op.cond_func(), while_op.body_func()}) { + for (auto callee : + {while_op.cond_function(), while_op.body_function()}) { PropagatePotentiallyWrittenUpFromCallee(callee, while_op.input()); } return; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc index 527af0934ea..352604955c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc @@ -39,6 +39,10 @@ namespace { struct ParallelizeEmbeddingParamsOpsPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 3afadd2b06d..a4ddb713ec0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -79,6 +79,11 @@ std::unique_ptr> CreateRewriteTPUEmbeddingOpsPass(); // Performs specific fusion for GPU targets. std::unique_ptr> CreateGpuOpFusionPass(); +// Create a pass that convert ops that copy tensors between devices, e.g. +// tf.Identity. +std::unique_ptr> +CreateTensorDeviceCopyConversionPass(); + struct LayoutOptimizationPipelineOptions : public PassPipelineOptions { Option force_data_format{ @@ -162,6 +167,12 @@ void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList* patterns, // future these fusions may be codegen'd automatically. std::unique_ptr> CreateFusedKernelMatcherPass(); +// Fuses operations defining `ContractionFusableInterface` interface into the +// contraction operations (MatMul, Conv2D, etc...). This is a more general +// version of `CreateFusedKernelMatcherPass` that relies on codegen to compose +// contraction fusions together. +std::unique_ptr> CreateContractionFusionPass(); + // Creates function pass to select device index/fold tf.DeviceIndex. std::unique_ptr> CreateDeviceIndexSelectorPass(); @@ -239,7 +250,7 @@ std::unique_ptr> CreateReplicateInvariantOpHoistingPass(); // Creates a pass that forms replica `tf_executor.island` from a single // `tf_device.replicate` island. -std::unique_ptr> CreateReplicateToIslandPass(); +std::unique_ptr> CreateReplicateToIslandPass(); // Creates a pass that creates `tf_executor.island` from a single // `tf_device.parallel_execute` island. 
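Editor's note: a minimal sketch of how the pass factory functions declared above in passes.h might be wired into a pipeline. The helper name and the pass selection/ordering here are hypothetical and purely illustrative; they are not part of this change, and real pipelines should be consulted for correct placement.

// Illustrative only; assumes both factories live in namespace mlir::TF and
// return function-level passes, as the declarations above suggest.
#include "mlir/IR/Function.h"       // mlir::FuncOp
#include "mlir/Pass/PassManager.h"  // mlir::OpPassManager
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

void AddExampleTFFunctionPasses(mlir::OpPassManager &pm) {  // hypothetical helper
  // Fuse ops implementing ContractionFusableInterface into contractions
  // such as MatMul/Conv2D.
  pm.addNestedPass<mlir::FuncOp>(mlir::TF::CreateContractionFusionPass());
  // Convert ops that copy tensors between devices, e.g. tf.Identity.
  pm.addNestedPass<mlir::FuncOp>(
      mlir::TF::CreateTensorDeviceCopyConversionPass());
}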
@@ -269,7 +280,15 @@ std::unique_ptr> CreateLaunchToDeviceAttributePass(); namespace TFTPU { // Creates a pass that forms clusters from operations of the same // `_tpu_replicate` attribute. -std::unique_ptr> CreateTPUClusterFormationPass(); +std::unique_ptr> CreateTPUClusterFormationPass(); + +// Creates a pass that cleans up `_tpu_replicate` attribute on operations +// that are inside a cluster. +std::unique_ptr> +CreateTPUClusterCleanupAttributesPass(); + +// Creates a pass that removes Identity/IdentityN ops from a cluster. +std::unique_ptr> CreateTPUIdentityPruningPass(); // Creates a pass that allows TPU program inputs to have layouts determined at // run time. @@ -279,6 +298,10 @@ std::unique_ptr> CreateTPUDynamicLayoutPass(); // `tf_device.launch_func` `padding_map` attribute to its encapsulated function. std::unique_ptr> CreateTPUDynamicPaddingMapperPass(); +// Creates a pass that adds `tf.ReadVariableOp` to a TPU cluster for resources +// the cluster only writes to. +std::unique_ptr> CreateTPUResourceReadForWritePass(); + // Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime // ops. std::unique_ptr> CreateTPURewritePass(); @@ -287,18 +310,29 @@ std::unique_ptr> CreateTPURewritePass(); // computation. std::unique_ptr> CreateTPUShardingIdentificationPass(); +// Creates a pass that moves `tf.AssignVariableOp` into a +// `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the +// only consumer of a `tf_device.parallel_execute` result. +std::unique_ptr> +CreateTPUParallelExecuteSinkResourceWritePass(); + // Creates a pass that merges device variable reads/updates into the surrounded // TPUExecute node. This allows the execute node to perform in-place variable // updates. std::unique_ptr> CreateTPUMergeVariablesWithExecutePass(); +// Creates a pass that wraps ReadVariableOp/AssignVariable op that consumes a +// packed tensor to have same device placement as underlying TPU device. +std::unique_ptr> CreateTPUColocateCompositeResourceOps(); + // Creates a pass that adds ops which perform formatting on variables at // run-time according to compilation result. std::unique_ptr> CreateTPUVariableReformattingPass(); // Creates a pass that groups outside compiled operations (CPU ops inside TPU // cluster) into clusters that can be extracted and run on the CPU. -std::unique_ptr> CreateTPUOutsideCompilationClusterPass(); +std::unique_ptr> +CreateTPUOutsideCompilationClusterPass(); // Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) // at head/tail of TPU cluster to run before/after TPU computation. @@ -321,6 +355,7 @@ std::unique_ptr> CreateTPUExtractOutsideCompilationPass(); // Populates the supplied passmanager with the passes required to run the +// bridge. void CreateTPUBridgePipeline(OpPassManager& pm); // Populates the supplied passmanager with the passes required to run the diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index ba876e08fbb..1e403bff0eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -36,8 +36,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #define DEBUG_TYPE "tf-region-cf-to-functional" @@ -158,9 +158,11 @@ void ExtractSingleBlockRegion(Region& region, StringRef name, } // Returns call for region with single call whose result feeds into the -// terminator of the region. Returns none if the region doesn't contain just -// call and non-truncting casts ops. -llvm::Optional IsSingleCallRegion(Region& region) { +// terminator of the region. if `allow_to_bool` is true, also allows a single +// ToBoolOp between the region yield and the call. Returns none if the region +// does not conform to this pattern. +llvm::Optional IsSingleCallRegion(Region& region, + bool allow_to_bool = false) { if (!llvm::hasSingleElement(region)) return llvm::None; Block& block = region.front(); @@ -169,31 +171,44 @@ llvm::Optional IsSingleCallRegion(Region& region) { if (it == block.rend()) return llvm::None; + // Operation which is expected to consume all the call results. + Operation* call_consumer = yield; + + // Allow a single ToBoolOp between the call and the yield (valid only + // when the yield has a single operand) + if (allow_to_bool && yield.getNumOperands() == 1 && isa(*it)) { + if (it->getResult(0) != yield.getOperand(0)) return llvm::None; + call_consumer = cast(*it); + it++; + } + // Check if there is a Call before the Yield. CallOp call = dyn_cast(*it++); if (!call) return llvm::None; + // All call results should feed into expected consumer + // All results of the call should feed into the yield. + if (call.getNumResults() != call_consumer->getNumOperands()) + return llvm::None; + + for (auto res_it : llvm::zip(call.getResults(), call_consumer->getOperands())) + if (std::get<0>(res_it) != std::get<1>(res_it)) return llvm::None; + // There can only be non-truncating cast op's prior to the call. for (; it != block.rend(); ++it) { CastOp cast = dyn_cast(*it); if (!cast || cast.Truncate()) return llvm::None; } - // All results of the call should feed into the yield. - if (call.getNumResults() != yield.getNumOperands()) return llvm::None; - - for (auto res_it : llvm::zip(call.getResults(), yield.getOperands())) - if (std::get<0>(res_it) != std::get<1>(res_it)) return llvm::None; - return call; } -using MatcherFn = function_ref; +using ArgMatcherFn = function_ref; // Returns whether the arguments of the given 2 calls are match (after looking // through cast ops). `matcher` is the predicate used to check if two arguments // match. -bool MatchCallArgs(CallOp first, CallOp second, MatcherFn matcher) { +bool MatchCallArgs(CallOp first, CallOp second, ArgMatcherFn matcher) { if (first.getNumOperands() != second.getNumOperands()) return false; Region& first_region = *first.getParentRegion(); @@ -225,38 +240,37 @@ struct TrivialTransformInfo { // List of callee names (one for each region). llvm::SmallVector callee_names; - // Constructor will analyze the 2 regions. - TrivialTransformInfo(Region& first, Region& second, MatcherFn matcher); + // Analyzes the given calls (from regions attached to the same parent op) to + // check if the parent op be transformed to functional form trivially (i.e., + // reusing existing functions and without outlining). 
This is possible when + // all the regions are single call regions (checked using matchers outside + // this class) and the all the calls match using the given argument matcher. + // + // If such a trivial transformation is possible, stash the relevant + // information needed for the transformation, else indicate that a trivial + // transformation is not possible by setting `can_transform` to false. + TrivialTransformInfo(llvm::Optional first_call, + llvm::Optional second_call, + ArgMatcherFn arg_matcher) { + if (!first_call || !second_call) return; + + if (!MatchCallArgs(first_call.getValue(), second_call.getValue(), + arg_matcher)) + return; + + can_transform = true; + callee_names = {first_call.getValue().getCallee(), + second_call.getValue().getCallee()}; + } }; -// Analyzes the given set of regions (attached to the same parent op) to check -// if the parent op be transformed to functional form trivially (i.e., reusing -// existing functions and without outlining). This is possible when all the -// regions are single call regions and the all the calls have the same -// arguments. -// -// If such a trivial transformation is possible, stash the relevant information -// needed for the transformation, else indicate that a trivial transformation is -// not possible by setting `can_transform` to false. -TrivialTransformInfo::TrivialTransformInfo(Region& first, Region& second, - MatcherFn matcher) { - auto call0 = IsSingleCallRegion(first); - auto call1 = IsSingleCallRegion(second); - if (!call0 || !call1) return; - - if (!MatchCallArgs(call0.getValue(), call1.getValue(), matcher)) return; - - can_transform = true; - callee_names = {call0.getValue().getCallee(), call1.getValue().getCallee()}; -} - // Transform IfRegionOp to IfOp. LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { llvm::SmallVector extern_values; // For IfOp, arguments of calls in the then and else regions match if they // are the same value. - auto if_matcher = [&](Value first, Region&, Value second, Region&) { + auto if_arg_matcher = [&](Value first, Region&, Value second, Region&) { if (first != second) return false; // collect the call arguments post lookup through cast Op's @@ -264,8 +278,9 @@ LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { return true; }; - const TrivialTransformInfo tti(if_region.then_branch(), - if_region.else_branch(), if_matcher); + const TrivialTransformInfo tti(IsSingleCallRegion(if_region.then_branch()), + IsSingleCallRegion(if_region.else_branch()), + if_arg_matcher); std::string then_name, else_name; @@ -293,16 +308,23 @@ LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { worklist, /*extern_values_passthrough=*/false); } + // Look through ToBool operations for the condition. + Value cond = if_region.cond(); + auto to_bool = dyn_cast_or_null(cond.getDefiningOp()); + if (to_bool) cond = to_bool.getOperand(); + // Once we have the `then` and `else` functions ready (either outlined or // existing ones), replace the region based op with a functional control flow // op. 
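The checks above are easiest to read as a single predicate: a region is a single-call region only when the call's results are exactly the operands of the expected consumer (the yield, or the one ToBool feeding the yield when `allow_to_bool` is set). The standalone helper below restates that predicate for illustration only; its name is not part of the change and it relies on the CallOp alias and headers already used in this file.

// Illustrative restatement of the result-forwarding check in
// IsSingleCallRegion; `call_consumer` is the yield or the single ToBool.
static bool CallResultsFeedConsumer(CallOp call, Operation* call_consumer) {
  if (call.getNumResults() != call_consumer->getNumOperands()) return false;
  for (auto pair : llvm::zip(call.getResults(), call_consumer->getOperands()))
    if (std::get<0>(pair) != std::get<1>(pair)) return false;
  return true;
}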
OpBuilder builder(if_region); auto if_op = builder.create( - if_region.getLoc(), if_region.getResultTypes(), if_region.cond(), - extern_values, then_name, else_name, if_region.is_stateless()); - CopyUnderscoredAttributes(if_region, if_op); + if_region.getLoc(), if_region.getResultTypes(), cond, extern_values, + then_name, else_name, if_region.is_stateless()); + CopyDeviceAndUnderscoredAttributes(if_region, if_op); if_region.replaceAllUsesWith(if_op.getResults()); if_region.erase(); + + if (to_bool && to_bool.use_empty()) to_bool.erase(); return success(); } @@ -315,8 +337,8 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( // cannot do a trivial transformation because post transform, we will need to // pass this extern value as an argument to the function, so we cannot use the // existing function as is. - auto while_matcher = [](Value first, Region& first_region, Value second, - Region& second_region) { + auto while_arg_matcher = [](Value first, Region& first_region, Value second, + Region& second_region) { if (!first.isa() || !second.isa()) return false; BlockArgument first_block_arg = first.cast(); @@ -329,8 +351,9 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( second_block_arg.getParentBlock() == &second_region.front(); }; - const TrivialTransformInfo tti(while_region.cond(), while_region.body(), - while_matcher); + const TrivialTransformInfo tti( + IsSingleCallRegion(while_region.cond(), /*allow_to_bool=*/true), + IsSingleCallRegion(while_region.body()), while_arg_matcher); // All existing inputs to while region are inputs to the functional while. auto new_inputs = llvm::to_vector<4>(while_region.getOperands()); @@ -376,7 +399,7 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( auto while_op = builder.create( while_region.getLoc(), new_result_types, new_inputs, cond_name, body_name, while_region.parallel_iterations(), while_region.is_stateless()); - CopyUnderscoredAttributes(while_region, while_op); + CopyDeviceAndUnderscoredAttributes(while_region, while_op); // Redirect old results to new results. for (auto it : llvm::zip( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc index 031d57e99ba..96ff2890558 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc @@ -151,7 +151,7 @@ bool IsOpReplicateInvariant(Region* replicate_region, Operation* op) { // invariant. Shape ops are rewritten to be invariant when possible, prior to // hoisting ops. void HoistReplicateInvariantOps(tf_device::ReplicateOp replicate_op) { - const int num_replicas = replicate_op.n().getLimitedValue(); + const int num_replicas = replicate_op.n(); Block* replicate_block = &replicate_op.GetBody(); replicate_op.walk([&](TF::ShapeOp shape_op) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index b16868311f0..5b70729ee80 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -32,12 +33,14 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/core/platform/logging.h" namespace mlir { @@ -45,10 +48,11 @@ namespace TFDevice { namespace { constexpr char kDeviceAttr[] = "device"; constexpr char kReplicaIdAttr[] = "_xla_replica_id"; +constexpr char kDeviceOrdinalAttr[] = "device_ordinal"; struct ReplicateToIslandPass - : public PassWrapper { - void runOnFunction() override; + : public PassWrapper> { + void runOnOperation() override; }; // Returns whether op requires `_xla_replica_id` attribute. @@ -57,29 +61,207 @@ bool RequiresReplicaIDAttribute(Operation* op) { TF::EnqueueTPUEmbeddingRaggedTensorBatchOp>(op); } -// Adds integer attribute that represents replica id for replicated ops that -// require replica id attribute. -void AddReplicaIdToOpsInReplicatedRegion(OpBuilder* builder, Region* region, - const int replica_id) { - region->walk([&](Operation* replicated_op) { - if (RequiresReplicaIDAttribute(replicated_op)) - replicated_op->setAttr(kReplicaIdAttr, - builder->getI32IntegerAttr(replica_id)); +bool RequiresDeviceOrdinalAttribute(Operation* op) { + return llvm::isa(op) || + llvm::isa(op); +} + +// Checks if a region contains ops that are replica variant. +bool HasReplicaVariantOps(Region& region, + const llvm::Optional& devices) { + auto result = region.walk([&](Operation* op) { + if (RequiresReplicaIDAttribute(op) || + (devices.hasValue() && RequiresDeviceOrdinalAttribute(op))) + return WalkResult::interrupt(); + + if (auto launch = dyn_cast(op)) + if (devices.hasValue() && devices.getValue().get(launch.device())) + return WalkResult::interrupt(); + + return WalkResult::advance(); }); + return result.wasInterrupted(); +} + +// Collects all functions reachable from a region, including transitive ones. 
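The helper named by the comment above (defined next) computes the transitive set of callees, building on SymbolTable::getSymbolUses over a region. The reduced sketch below shows just the direct-callee step; the transitive closure is then a plain worklist over each callee's callable region, as in the definition that follows. The helper name here is an assumption for illustration.

// Illustrative-only helper: functions directly referenced from `region`.
static llvm::SmallPtrSet<FuncOp, 4> GetDirectCallees(ModuleOp module,
                                                     Region& region) {
  llvm::SmallPtrSet<FuncOp, 4> callees;
  SymbolTable symbol_table(module);
  auto uses = symbol_table.getSymbolUses(&region);
  if (!uses) return callees;
  for (auto& use : *uses)
    if (auto func = symbol_table.lookup<FuncOp>(
            use.getSymbolRef().getRootReference()))
      callees.insert(func);
  return callees;
}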
+llvm::SmallPtrSet GetReachableFunctionsFromRegion(ModuleOp module, + Region& region) { + llvm::SmallPtrSet visited_functions; + + SymbolTable symbol_table(module); + auto symbol_uses = symbol_table.getSymbolUses(®ion); + if (!symbol_uses) return {}; + + for (auto& use : *symbol_uses) + if (auto func = + symbol_table.lookup(use.getSymbolRef().getRootReference())) + visited_functions.insert(func); + + llvm::SmallVector functions_to_visit(visited_functions.begin(), + visited_functions.end()); + while (!functions_to_visit.empty()) { + llvm::SmallVector new_functions_to_visit; + + for (FuncOp function_to_visit : functions_to_visit) { + auto func_symbol_uses = + symbol_table.getSymbolUses(function_to_visit.getCallableRegion()); + if (!func_symbol_uses) continue; + + for (auto& use : *func_symbol_uses) + if (auto func = symbol_table.lookup( + use.getSymbolRef().getRootReference())) + if (visited_functions.insert(func).second) + new_functions_to_visit.push_back(func); + } + + functions_to_visit.swap(new_functions_to_visit); + } + + return visited_functions; +} + +// Collects all functions and transitive functions reachable from region that +// contain replicate variant ops. +llvm::SmallDenseMap GetReachableFunctionsToClone( + ModuleOp module, Region& region, + const llvm::Optional& devices) { + llvm::SmallPtrSet reachable_functions = + GetReachableFunctionsFromRegion(module, region); + + llvm::SmallDenseMap functions_to_clone; + llvm::SmallVector functions_to_visit; + for (FuncOp func : reachable_functions) { + if (!func.getCallableRegion()) continue; + if (HasReplicaVariantOps(*func.getCallableRegion(), devices)) { + functions_to_clone.insert({func.getName(), func}); + functions_to_visit.push_back(func); + } + } + + while (!functions_to_visit.empty()) { + llvm::SmallVector new_functions_to_visit; + + for (FuncOp func_to_visit : functions_to_visit) { + auto func_uses = func_to_visit.getSymbolUses(module); + if (!func_uses) continue; + for (auto use : *func_uses) { + auto parent_func = use.getUser()->getParentOfType(); + if (!parent_func || !reachable_functions.contains(parent_func) || + !functions_to_clone.insert({parent_func.getName(), parent_func}) + .second) + continue; + new_functions_to_visit.push_back(parent_func); + } + } + + functions_to_visit.swap(new_functions_to_visit); + } + + return functions_to_clone; +} + +struct FuncOldNameAndClone { + StringRef old_name; + FuncOp clone; +}; + +// Replaces all symbol uses with cloned functions, for `region` and across the +// cloned functions themselves. +LogicalResult UpdateSymbolUsesWithClones( + SymbolTable& symbol_table, ModuleOp module, Region& region, + llvm::MutableArrayRef cloned_functions) { + llvm::SmallVector, 4> old_to_new_names; + old_to_new_names.reserve(cloned_functions.size()); + for (auto& cloned_function : cloned_functions) + old_to_new_names.push_back( + {cloned_function.old_name, cloned_function.clone.getName()}); + + for (const auto& old_to_new_name : old_to_new_names) { + if (failed(symbol_table.replaceAllSymbolUses( + old_to_new_name.first, old_to_new_name.second, ®ion))) + return failure(); + + for (auto& cloned_function : cloned_functions) + if (failed(symbol_table.replaceAllSymbolUses( + old_to_new_name.first, old_to_new_name.second, + cloned_function.clone.getCallableRegion()))) + return failure(); + } + return success(); +} + +// Collects TPU device ordinal for outside compilation communication ops. 
This +// currently assumes outside compilation only uses `TPU_REPLICATED_CORE_0` +// aliased device for the device computation. +llvm::Optional GetDeviceOrdinal( + const llvm::Optional& devices, Location loc, + unsigned replica_id) { + int64_t device_ordinal = 0; + if (devices.hasValue()) { + if (auto tpu_replica_0 = devices.getValue().get("TPU_REPLICATED_CORE_0")) { + llvm::StringRef tpu_device = tpu_replica_0.cast()[replica_id] + .cast() + .getValue(); + if (succeeded(tensorflow::GetDeviceOrdinalFromDeviceString( + loc, tpu_device, &device_ordinal))) { + return llvm::Optional(device_ordinal); + } + } + } + return llvm::None; +} + +// Updates replica variant ops in a region based on replica `replica_id`. +// TODO(b/157624749): Replace this with better abstraction to differentiate ops +// for different replicas. Some ops, such as XlaHostCompute op or TPU Embedding +// ops, require replica id to be added as an op attribute to be used during +// execution. Handle such ops separately and add an integer attribute that +// represents replica id. +LogicalResult UpdateRegionReplicateVariantOps( + OpBuilder& builder, Location loc, Region& region, int replica_id, + llvm::MutableArrayRef cloned_functions, + const llvm::Optional& devices) { + llvm::Optional device_ordinal = + GetDeviceOrdinal(devices, loc, replica_id); + + auto update_replicate_variant_ops = [&](Operation* op) { + // Add replica id. + if (RequiresReplicaIDAttribute(op)) + op->setAttr(kReplicaIdAttr, builder.getI32IntegerAttr(replica_id)); + + if (!devices.hasValue()) return; + + // Map aliased devices to explicit devices based on replica. + if (auto launch = dyn_cast(op)) + if (auto device_by_replica = devices.getValue().get(launch.device())) + launch.setAttr( + kDeviceAttr, + device_by_replica.cast()[replica_id].cast()); + + // Add device ordinal. + if (device_ordinal && RequiresDeviceOrdinalAttribute(op)) + op->setAttr(kDeviceOrdinalAttr, + builder.getI64IntegerAttr(*device_ordinal)); + }; + + region.walk(update_replicate_variant_ops); + for (auto& cloned_function : cloned_functions) + cloned_function.clone.getCallableRegion()->walk( + update_replicate_variant_ops); + + return success(); } // Creates islands per replica from `tf_device.replicate` region. If for a // `tf_device.launch` op the device is an aliased device of the // `tf_device.replicate`, the device will be remapped to an explicit device // for the associated replica island. -llvm::SmallVector ExpandReplicateIntoReplicas( - const Dialect* tf_dialect, OpBuilder* builder, +LogicalResult ExpandReplicateIntoReplicas( + const Dialect* tf_dialect, OpBuilder& builder, ModuleOp module, tf_executor::IslandOp island_op, tf_device::ReplicateOp replicate_op, - int num_replicas) { - auto devices = replicate_op.devices(); - const bool has_devices = devices.hasValue(); - llvm::SmallVector replicas; + int num_replicas, llvm::SmallVectorImpl& replicas) { replicas.reserve(num_replicas); + auto devices = replicate_op.devices(); // Collect result types and operands. Operation& terminator = replicate_op.GetBody().back(); @@ -88,16 +270,30 @@ llvm::SmallVector ExpandReplicateIntoReplicas( llvm::SmallVector replica_inputs(island_op.controlInputs()); // Replace replicate terminator with YieldOp. 
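UpdateRegionReplicateVariantOps above folds three per-replica rewrites into one walk: tagging ops that need `_xla_replica_id`, remapping aliased launch devices, and attaching the TPU device ordinal. The fragment below is a hedged, reduced sketch of only the first rewrite, factored out purely for illustration; kReplicaIdAttr and RequiresReplicaIDAttribute come from this file, and the helper name is not part of the change.

// Reduced illustration of the replica-id tagging performed above.
static void TagReplicaVariantOps(OpBuilder& builder, Region& region,
                                 int replica_id) {
  region.walk([&](Operation* op) {
    if (RequiresReplicaIDAttribute(op))
      op->setAttr(kReplicaIdAttr, builder.getI32IntegerAttr(replica_id));
  });
}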
- builder->setInsertionPoint(&terminator); - builder->create(terminator.getLoc(), - terminator.getOperands()); + builder.setInsertionPoint(&terminator); + builder.create(terminator.getLoc(), + terminator.getOperands()); terminator.erase(); - builder->setInsertionPoint(island_op); + auto funcs_to_clone = + GetReachableFunctionsToClone(module, replicate_op.body(), devices); + SymbolTable symbol_table(module); + + builder.setInsertionPoint(island_op); BlockAndValueMapping mapping; for (int i : llvm::seq(0, num_replicas)) { + // Clone reachable functions with replica variant ops. + llvm::SmallVector cloned_functions; + cloned_functions.reserve(funcs_to_clone.size()); + for (auto& func_to_clone : funcs_to_clone) { + auto cloned_function = func_to_clone.getSecond().clone(); + symbol_table.insert(cloned_function, module.end()); + cloned_functions.push_back( + {func_to_clone.getSecond().getName(), cloned_function}); + } + // Create new island for replica. - auto replica = builder->create( + auto replica = builder.create( island_op.getLoc(), output_types, control_type, replica_inputs); // Map block arg to replica arg. @@ -109,28 +305,19 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // Copy over replicate region into replica island. replicate_op.body().cloneInto(&replica.body(), mapping); - // TODO(b/157624749): Replace this with better abstraction to - // differentiate ops for different replicas. - // Some ops, such as XlaHostCompute op or TPU Embedding ops, require - // replica id to be added as an op attribute to be used during - // execution. Handle such ops separately and add an integer attribute - // that represents replica id. - AddReplicaIdToOpsInReplicatedRegion(builder, &replica.body(), i); + if (failed(UpdateSymbolUsesWithClones(symbol_table, module, replica.body(), + cloned_functions))) + return failure(); - // Map aliased devices to explicit devices based on replica. - if (has_devices) { - replica.walk([&](tf_device::LaunchOp launch) { - if (auto device_by_replica = devices.getValue().get(launch.device())) - launch.setAttr( - kDeviceAttr, - device_by_replica.cast()[i].cast()); - }); - } + if (failed(UpdateRegionReplicateVariantOps( + builder, replicate_op.getLoc(), replica.body(), + /*replica_id=*/i, cloned_functions, devices))) + return failure(); replicas.push_back(replica); } - return replicas; + return success(); } // Creates islands per replica from `tf_device.replicate` region and remap @@ -183,17 +370,19 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -void CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::GraphOp graph_op, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, + ModuleOp module, + tf_executor::GraphOp graph_op, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); - const int num_replicas = replicate_op.n().getLimitedValue(); + const int num_replicas = replicate_op.n(); // Create islands per replica. - llvm::SmallVector replicas = - ExpandReplicateIntoReplicas(tf_dialect, &builder, island_op, replicate_op, - num_replicas); + llvm::SmallVector replicas; + if (failed(ExpandReplicateIntoReplicas(tf_dialect, builder, module, island_op, + replicate_op, num_replicas, replicas))) + return failure(); // Collect all replica results. 
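The per-replica loop above clones every replica-variant callee once per replica and relies on the module's SymbolTable to keep the clones uniquely named before UpdateSymbolUsesWithClones redirects references to them. A hedged sketch of that single step, separated only for illustration (the helper name is assumed):

// Clones `func` for one replica and registers the clone at the end of the
// module; SymbolTable::insert renames the clone if its name would collide.
static FuncOp CloneCalleeForReplica(SymbolTable& symbol_table, ModuleOp module,
                                    FuncOp func) {
  FuncOp clone = func.clone();
  symbol_table.insert(clone, module.end());
  return clone;
}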
llvm::SmallVector replicas_outputs(replicate_op.getNumResults(), @@ -244,36 +433,41 @@ void CreateIslandsFromReplicate(const Dialect* tf_dialect, } island_op.erase(); + return success(); } -// Finds islands with a single `tf_device.replicate` and create individual -// islands per replica of the replicate. -void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, - tf_executor::GraphOp graph_op, - tf_executor::IslandOp island_op) { - if (!island_op.WrapsSingleOp()) return; - - if (auto replicate_op = - llvm::dyn_cast(&island_op.GetBody().front())) - CreateIslandsFromReplicate(tf_dialect, graph_op, island_op, replicate_op); -} - -void ReplicateToIslandPass::runOnFunction() { - const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); +void ReplicateToIslandPass::runOnOperation() { + auto module = getOperation(); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); if (!tf_dialect) { - signalPassFailure(); - getFunction().emitError() << "'tf' dialect is not registered"; + module.emitError() << "'tf' dialect is not registered"; + return signalPassFailure(); } - getFunction().walk([&](tf_executor::GraphOp graph_op) { - for (auto island_op : - llvm::make_early_inc_range(graph_op.getOps())) - LowerSingleIslandReplicateToIslands(tf_dialect, graph_op, island_op); + // Find islands with a single `tf_device.replicate` and create individual + // islands per replica of the replicate. + llvm::SmallVector replicate_op_islands; + module.walk([&](tf_executor::GraphOp graph_op) { + for (auto island_op : graph_op.getOps()) { + if (!island_op.WrapsSingleOp()) continue; + + if (isa(&island_op.GetBody().front())) + replicate_op_islands.push_back(island_op); + } }); + + for (tf_executor::IslandOp island_op : replicate_op_islands) { + auto graph_op = island_op.getParentOfType(); + auto replicate_op = + cast(island_op.GetBody().front()); + if (failed(CreateIslandsFromReplicate(tf_dialect, module, graph_op, + island_op, replicate_op))) + return signalPassFailure(); + } } } // anonymous namespace -std::unique_ptr> CreateReplicateToIslandPass() { +std::unique_ptr> CreateReplicateToIslandPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index 7e8e9ee30c8..648805febfe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -26,10 +26,13 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -39,6 +42,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h" + +#define DEBUG_TYPE "tf-resource-device-inference" namespace mlir { namespace TF { @@ -66,22 +72,18 @@ class PerFunctionResult { : alias_analysis_(alias_analysis) {} // Returns the recorded device assignment for a resource, if any. - llvm::Optional DeviceForResource( - const Value resource) const { - llvm::Optional result; - if (alias_analysis_.IsUnknownResource(resource)) return result; + Optional DeviceForResource(Value resource) const { + Optional result; + if (alias_analysis_.IsUnknownResource(resource)) return llvm::None; for (int64_t id : alias_analysis_.GetResourceUniqueIds(resource)) { auto it = resource_id_to_device_.find(id); if (it == resource_id_to_device_.end()) continue; - if (!result) { + if (!result || result == it->second) { result = it->getSecond(); continue; } - if (result != it->getSecond()) { - // Got conflicting assignments, clear the result. - result.reset(); - return result; - } + // Got conflicting assignments + return llvm::None; } return result; } @@ -90,7 +92,7 @@ class PerFunctionResult { // conflicts with an existing one, returns an error. // // If `changed` is provided, assign *changed to true if anything is modified. - LogicalResult AddResourceDevice(const Value resource, llvm::StringRef device, + LogicalResult AddResourceDevice(Value resource, StringRef device, bool* changed = nullptr) { if (alias_analysis_.IsUnknownResource(resource)) return success(); for (int64_t id : alias_analysis_.GetResourceUniqueIds(resource)) { @@ -106,13 +108,12 @@ class PerFunctionResult { } private: - llvm::SmallDenseMap resource_id_to_device_; + llvm::SmallDenseMap resource_id_to_device_; const TF::ResourceAliasAnalysis::Info& alias_analysis_; }; // Tries to record device assignment for a resource. -LogicalResult AddResourceDeviceAndEmitError(const Value resource, - llvm::StringRef device, +LogicalResult AddResourceDeviceAndEmitError(Value resource, StringRef device, Operation* error_reporting_op, PerFunctionResult* result, bool* changed = nullptr) { @@ -124,18 +125,34 @@ LogicalResult AddResourceDeviceAndEmitError(const Value resource, return res; } +// Extracts and canonicalizes the device attribute. +inline StringRef GetDeviceAttr(FuncOp func, int arg_no) { + auto device_attr = + func.getArgAttrOfType(arg_no, kFuncDeviceAttr); + return device_attr ? device_attr.getValue() : ""; +} + +// Extracts and canonicalizes the device attribute. +inline StringRef GetDeviceAttr(Operation* op) { + auto device_attr = op->getAttrOfType(kDeviceAttr); + return device_attr ? device_attr.getValue() : ""; +} + +// Print operation with debug info (to get line number info for debugging) +void dump(StringRef message, Operation* op) { + llvm::dbgs() << message; + op->print(llvm::dbgs(), OpPrintingFlags().enableDebugInfo(true)); + llvm::dbgs() << "\n"; +} + // Propagates device assignment inside a function. LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, PerFunctionResult* result) { OpBuilder builder(func_op); // Function arguments. 
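With GetDeviceAttr and filter_resources in place, the loops in this function (and several later ones in the pass) can visit only resource-typed values and read an already-canonicalized device string. Below is a minimal sketch of that idiom with an assumed helper name; the real pass records the result through AddResourceDeviceAndEmitError rather than discarding it.

// Illustrative-only: iterate resource arguments and fetch their device.
static void VisitResourceArgs(FuncOp func_op) {
  for (BlockArgument arg : filter_resources(func_op.getArguments())) {
    StringRef device = GetDeviceAttr(func_op, arg.getArgNumber());
    (void)device;  // the pass would record or propagate this assignment
  }
}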
- for (auto arg : func_op.getArguments()) { - if (!mlir::getElementTypeOrSelf(arg.getType()).isa()) { - continue; - } - auto device_attr = func_op.getArgAttrOfType( - arg.getArgNumber(), kFuncDeviceAttr); - if (!device_attr || device_attr.getValue() == "") { + for (auto arg : filter_resources(func_op.getArguments())) { + StringRef device_attr = GetDeviceAttr(func_op, arg.getArgNumber()); + if (device_attr.empty()) { // If device_attr does not exist, try to construct it from any recorded // assignment. if (auto device = result->DeviceForResource(arg)) { @@ -145,51 +162,71 @@ LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, continue; } // Record the attribute. - auto res = AddResourceDeviceAndEmitError(arg, device_attr.getValue(), - func_op, result); + auto res = AddResourceDeviceAndEmitError(arg, device_attr, func_op, result); if (failed(res)) return res; } - auto walk_res = func_op.walk([&](Operation* op) { - if (auto var_handle = llvm::dyn_cast(op)) { - // Record VarHandleOp's device attribute. - auto device_attr = - var_handle.getAttrOfType(kDeviceAttr); - if (!device_attr || device_attr.getValue().empty()) { - return WalkResult::advance(); - } - auto res = AddResourceDeviceAndEmitError( - var_handle.resource(), device_attr.getValue(), op, result); - if (failed(res)) return WalkResult::interrupt(); - } - if (auto identity = llvm::dyn_cast(op)) { - // Try to construct IdentityOp's attribute from recorded assignment. - if (!mlir::getElementTypeOrSelf(identity.output().getType()) - .isa()) { - return WalkResult::advance(); - } - if (auto device = result->DeviceForResource(identity.output())) { - auto device_attr = - identity.getAttrOfType(kDeviceAttr); - if (!device_attr || device_attr.getValue().empty()) { - identity.setAttr(kDeviceAttr, builder.getStringAttr(*device)); + + // To support WhileRegion, we need to propagate device attributes from + // WhileRegion operands to body/cond region arguments *prior* to visiting + // these regions. Use tensorflow::walk() instead of MLIR core walker to + // implement such a pre-order walk. + auto walk_res = tensorflow::GenericWalk( + func_op, [&](Operation* op, const tensorflow::WalkStage& stage) { + // We just need to visit operations in pre-order mode. + if (!stage.IsBeforeAllRegions()) return WalkResult::advance(); + + if (auto var_handle = dyn_cast(op)) { + // Record VarHandleOp's device attribute. + StringRef device_attr = GetDeviceAttr(op); + if (device_attr.empty()) return WalkResult::advance(); + auto res = AddResourceDeviceAndEmitError(var_handle.resource(), + device_attr, op, result); + if (failed(res)) return WalkResult::interrupt(); + } else if (auto identity = dyn_cast(op)) { + LLVM_DEBUG(dump("Visiting ", identity)); + // Try to construct IdentityOp's attribute from recorded assignment. + if (!GetDeviceAttr(op).empty()) return WalkResult::advance(); + for (auto output : filter_resources(op->getResults())) { + LLVM_DEBUG(llvm::dbgs() << " Processing output #" + << output.getResultNumber() << "\n"); + if (auto device = result->DeviceForResource(output)) { + LLVM_DEBUG(llvm::dbgs() + << " Setting device = " << *device << "\n"); + identity.setAttr(kDeviceAttr, builder.getStringAttr(*device)); + } + } + } else if (auto while_region = dyn_cast(op)) { + // For WhileRegion, do local analysis prior to visiting the attached + // regions and propagate device annotations to the cond and body + // region arguments. The annotations are the union of annotations + // on the input and result. 
Resource alias analysis already propagates + // resource ID from the inputs to the results for a while, so just + // need to consider the results. + LLVM_DEBUG(llvm::dbgs() << "Visiting WhileRegion\n"); + + for (auto output : filter_resources(while_region.getResults())) { + auto device = result->DeviceForResource(output); + int output_index = output.getResultNumber(); + if (!device) { + LLVM_DEBUG(llvm::dbgs() + << " No device for output #" << output_index << "\n"); + continue; + } + // Transfer the annotation to both region arguments + for (Region* region : while_region.getRegions()) { + BlockArgument arg = region->getArgument(output_index); + LLVM_DEBUG(llvm::dbgs() + << " Propagating device = '" << *device + << "' to arg #" << output_index << " of region #" + << region->getRegionNumber() << "\n"); + if (failed(AddResourceDeviceAndEmitError(arg, *device, + while_region, result))) + return WalkResult::interrupt(); + } + } } - } - return WalkResult::advance(); - } - // Propagate and record output device assignment for other ops based on - // existing recording. E.g., IdentityN. - for (auto output : op->getResults()) { - if (!mlir::getElementTypeOrSelf(output.getType()) - .isa()) { - continue; - } - if (auto device = result->DeviceForResource(output)) { - auto res = AddResourceDeviceAndEmitError(output, *device, op, result); - if (failed(res)) return WalkResult::interrupt(); - } - } - return WalkResult::advance(); - }); + return WalkResult::advance(); + }); return failure(walk_res.wasInterrupted()); } @@ -198,13 +235,13 @@ void ResourceDeviceInference::runOnOperation() { const auto& resource_alias_analysis = getAnalysis(); - llvm::SmallDenseMap per_function_results; + llvm::SmallDenseMap per_function_results; llvm::SetVector worklist; - module.walk([&](FuncOp func_op) { + for (auto func_op : module.getOps()) { worklist.insert(func_op); per_function_results.try_emplace( func_op, func_op, resource_alias_analysis.GetAnalysisForFunc(func_op)); - }); + } // Helper that propagates an op's recorded operand device assignments to its // called function's arguments. auto propagate_operands_to_callee_arguments = @@ -214,51 +251,59 @@ void ResourceDeviceInference::runOnOperation() { assert(callee); auto& callee_res = per_function_results.find(callee)->getSecond(); bool callee_needs_recompute = false; - for (auto operand_and_argument : - llvm::zip(caller_operands, callee.getArguments())) { - if (!mlir::getElementTypeOrSelf( - std::get<0>(operand_and_argument).getType()) - .isa()) { - continue; - } - auto device = - caller_res.DeviceForResource(std::get<0>(operand_and_argument)); + for (BlockArgument arg : filter_resources(callee.getArguments())) { + Value arg_operand = caller_operands[arg.getArgNumber()]; + auto device = caller_res.DeviceForResource(arg_operand); if (!device) continue; - if (failed(AddResourceDeviceAndEmitError( - std::get<1>(operand_and_argument), *device, caller, - &callee_res, &callee_needs_recompute))) { + LLVM_DEBUG(llvm::dbgs() + << "Propagating '" << *device << "' to arg #" + << arg.getArgNumber() << " of function @" + << callee.getName() << "\n"); + if (failed(AddResourceDeviceAndEmitError(arg, *device, caller, + &callee_res, + &callee_needs_recompute))) return failure(); - } } // If the callee recording is modified, make sure that it will be // reprocessed. 
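Zooming out, the pass is a standard fixed-point computation: the lambda above pushes device information from a caller's operands into a callee's per-function result, and any callee whose result table changes is re-queued. The skeleton below shows only that driver shape with the bodies elided; it is a sketch, not the pass itself.

// Worklist skeleton of the propagation driver (illustration only).
static void PropagateUntilFixedPoint(ModuleOp module) {
  llvm::SetVector<FuncOp> worklist;
  for (FuncOp func : module.getOps<FuncOp>()) worklist.insert(func);
  while (!worklist.empty()) {
    FuncOp func = worklist.pop_back_val();
    (void)func;
    // 1) propagate devices within `func`;
    // 2) propagate operand devices to the callees of tf.If/tf.While/call ops,
    //    re-inserting any callee whose recorded assignments changed.
  }
}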
- if (callee_needs_recompute) { - worklist.insert(callee); - } + if (callee_needs_recompute) worklist.insert(callee); } return success(); }; while (!worklist.empty()) { - auto func_op = worklist.back(); - worklist.pop_back(); + auto func_op = worklist.pop_back_val(); auto& func_res = per_function_results.find(func_op)->getSecond(); // In-function propagation. - if (failed(ComputeResourceDevicesInComputation(func_op, &func_res))) { + if (failed(ComputeResourceDevicesInComputation(func_op, &func_res))) return signalPassFailure(); - } + // Propagation to callees. auto walk_res = func_op.walk([&](Operation* op) { - if (auto while_op = llvm::dyn_cast(op)) { + if (auto while_op = dyn_cast(op)) { if (failed(propagate_operands_to_callee_arguments( while_op, while_op.getOperands(), - {while_op.body_func(), while_op.cond_func()}, func_res))) - return WalkResult::interrupt(); - } else if (auto if_op = llvm::dyn_cast(op)) { - if (failed(propagate_operands_to_callee_arguments( - if_op, if_op.input(), {if_op.then_func(), if_op.else_func()}, + {while_op.body_function(), while_op.cond_function()}, func_res))) return WalkResult::interrupt(); + } else if (auto if_op = dyn_cast(op)) { + if (failed(propagate_operands_to_callee_arguments( + if_op, if_op.input(), + {if_op.then_function(), if_op.else_function()}, func_res))) + return WalkResult::interrupt(); + } else if (auto call = dyn_cast(op)) { + auto func = dyn_cast(call.resolveCallable()); + if (!func) { + op->emitError( + "Cannot propagate device attribute to callee: Unable to resolve " + "call"); + return WalkResult::interrupt(); + } + LLVM_DEBUG(llvm::dbgs() + << "Visiting call to function @" << func.getName() << "\n"); + if (failed(propagate_operands_to_callee_arguments( + call, call.getArgOperands(), {func}, func_res))) + return WalkResult::interrupt(); } return WalkResult::advance(); }); @@ -266,15 +311,15 @@ void ResourceDeviceInference::runOnOperation() { } } +PassRegistration pass( + "tf-resource-device-inference", + "Propagates the device attribute on resources from callers to callees."); + } // namespace std::unique_ptr> CreateResourceDeviceInferencePass() { return std::make_unique(); } -static PassRegistration pass( - "tf-resource-device-inference", - "Propagates the device attribute on resources from callers to callees."); - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 702455d156d..5984aafb88f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -15,12 +15,17 @@ limitations under the License. // This pass lifts resource variable operations outside of device computation. +#include #include +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -31,20 +36,24 @@ limitations under the License. 
#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" @@ -136,15 +145,18 @@ struct ResourceOpLiftingPass void runOnOperation() override; }; -// Removes identity nodes in the block. The device computation does not need -// such nodes to carry information. -void RemoveIdentity(Block* block) { - for (auto& op : llvm::make_early_inc_range(*block)) { - if (isa(&op)) { - op.replaceAllUsesWith(op.getOperands()); - op.erase(); - } - } +bool IsResource(Value value) { + return getElementTypeOrSelf(value.getType()).isa(); +} + +// Get the type of the data contained in a resource. Returns null if there is +// no single type in the resource. +Type GetResourceSubtype(Value value) { + auto resource_type = + getElementTypeOrSelf(value.getType()).dyn_cast(); + auto subtypes = resource_type.getSubtypes(); + if (subtypes.size() == 1) return subtypes[0]; + return nullptr; } // Performs store-load forwarding. This effectively removes @@ -186,166 +198,448 @@ void ForwardStoreToLoad(Block* block) { } } -// Moves resource load operations with the provided `move_load` function. This -// assumes load-store forwarding has been performed on this block such that -// all loads of same resource are on its initial values. A `skip_load` functions -// is used to indicate whether a load should be skipped. If there are multiple -// loads on the same resource, only the first one will be moved, and the later -// ones will be removed and replaced with the first one. -void HoistResourceLoads( - Block* block, llvm::function_ref skip_load, - llvm::function_ref move_load) { - llvm::SmallDenseMap resource_to_read_ops; +//===----------------------------------------------------------------------===// +// RegionResourceHoister +//===----------------------------------------------------------------------===// +// Helper class to hoist resource ops out of regions attached to an op. +class RegionResourceHoister { + public: + explicit RegionResourceHoister(Operation* op) : op_(op) {} + + // Analyzes attached regions to record resources read and written. + LogicalResult Analyze(); + + // Returns all resources accessed by the regions attached the op. 
+ auto& GetResources() { return resources_; } + + // Returns if the given value is a resouce that needs lifting. + bool Contains(Value resource) const { + return resources_.find(resource) != resources_.end(); + } + + // Drops the given resource from lifting. + void DropResource(Value resource) { + resources_.erase(resource); + written_resources_.remove(resource); + } + + // Replaces all resource loads in all regions attached to the op. + void ReplaceResourceLoads(bool read_only) { + llvm::for_each(op_->getRegions(), [&](Region& region) { + ReplaceResourceLoads(region, read_only); + }); + } + + static LogicalResult ReplaceOpWithNewOp(Operation* op); + + private: + // Returns if any resources need lifting. + bool NeedsLifting() const { return !resources_.empty(); } + + // Returns the number of results generated by the lifted op. + int GetLiftedNumResults() const { return num_new_results_; } + + // Generates hoisted reads for resources that need them before the op. + void GenerateHoistedReads(); + + // Replaces all resource loads in the given region with hoisted loads. If + // `read_only` is true, limit this replacement to read only resources. + void ReplaceResourceLoads(Region& region, bool read_only); + + // Appends final values writte to resources to the region returns for the + // given set of regions. + void AppendResourceStoreValueToReturn(RegionRange regions); + + // Performs the final replacement of the op. + void ReplaceOpWithNewOp(); + + // Returns is this resource was written to in any of the regions. + bool IsWritten(Value resource) const { + return written_resources_.contains(resource); + } + + static LogicalResult HoistResourcesOutOfIfCaseCluster(Operation* op); + static LogicalResult HoistResourcesOutOfWhileRegion(TF::WhileRegionOp op); + + Operation* op_; + + // Per resource information about accesses to that resource. + struct ResourceInfo { + // Is this resource read in any of the regions? + bool is_read; + // Is this resource written in any of the regions? + bool is_written; + // Is this resource written in all of the regions? + bool is_written_all; + // The hoisted read used to replace region reads. + Value hoisted_read; + // the type of the data held by the resource. + Type data_type; + // For written resources, the result # of the lifted op which will hold the + // value of the resource. This result will be used to generates writes to + // the resource after the lifted op. + int result_index; + // Attributes on the read operation. + DictionaryAttr read_attrs; + // Attributes on the write operation. + DictionaryAttr write_attrs; + + ResourceInfo() + : is_read(false), + is_written(false), + is_written_all(false), + hoisted_read(nullptr), + data_type(nullptr), + result_index(-1) {} + + bool IsResultIndexAssigned() { return result_index != -1; } + + // Refine the resource type using the given type `type`. + void RefineType(Type type) { + if (!data_type) { + data_type = type; + } else { + data_type = TF::GetCastCompatibleType(data_type, type, + /*may_ignore_ref_type_a=*/false); + assert(data_type != nullptr && "Resource used with incompatible types"); + } + } + }; + llvm::MapVector resources_; + llvm::SetVector written_resources_; + // number of new results after lifting. + int num_new_results_; +}; + +// Analyzes resources that are read or written within attached regions. +LogicalResult RegionResourceHoister::Analyze() { + // Hoisting of child regions might have created opportunity for store-load + // forwarding. 
+ for (Region& region : op_->getRegions()) { + ForwardStoreToLoad(®ion.front()); + } + + llvm::SetVector all_resources; + bool is_func = false; + // For functions, the resources to analyze are the function arguments. + // Otherwise, its the region captures. + if (FuncOp func = dyn_cast(op_)) { + is_func = true; + Region& body = func.getBody(); + for (BlockArgument arg : body.getArguments()) { + if (IsResource(arg)) all_resources.insert(arg); + } + } else { + getUsedValuesDefinedAbove(op_->getRegions(), all_resources); + all_resources.remove_if([](Value value) { return !IsResource(value); }); + } + + num_new_results_ = op_->getNumResults(); + + for (auto resource : all_resources) { + ResourceInfo info; + info.data_type = GetResourceSubtype(resource); + llvm::BitVector written_regions(op_->getNumRegions()); + bool unsupported_use = false; + for (OpOperand& use : resource.getUses()) { + Operation* user = use.getOwner(); + // If the user is not in one of the regions, we are not interested in it. + // Since all the sub-regions within this region (i.e., regions attached to + // op's in this region) have themselves gone through lifting, all resource + // users are expected to be operations in this region and and not embedded + // within other sub-regions attached to op's in this region. So the check + // for whether a user is in one of the regions attached to this op is + // straightforward. + if (user->getParentRegion()->getParentOp() != op_) continue; + + // For functions, if the resource is used as a return operand, use that + // as its result index. + if (is_func && isa(user)) { + assert(!info.IsResultIndexAssigned() && + "Expect resource argument to returned no more than once"); + info.result_index = use.getOperandNumber(); + continue; + } + + auto read = dyn_cast(user); + auto write = dyn_cast(user); + if (!read && !write) { + unsupported_use = true; + break; + } + + if (read && !info.is_read) { + info.is_read = true; + info.RefineType(read.value().getType()); + info.read_attrs = user->getAttrDictionary(); + } + + if (write) { + info.is_written = true; + info.RefineType(write.value().getType()); + info.write_attrs = user->getAttrDictionary(); + written_regions.set(user->getParentRegion()->getRegionNumber()); + } + } + + // If the resource is used in an op that we do not understand, skip + // lifting for that resource. + if (unsupported_use) continue; + + info.is_written_all = written_regions.count() == op_->getNumRegions(); + + // If the resource is written in some but not all regions, we would need + // a read for the value before these regions. Note that this is applicable + // only to multi-region ops: + // If/Case: If not all regions write to the resource, post hoisting the read + // value need to be routed through all paths that don't write. + // While: since while condition cannot write, any resource written in the + // while body will need to be read as well in case the while body is never + // executed. + // Both cases are handled by the condition below. + if (info.is_written && !info.is_written_all) info.is_read = true; + + // Allocate a result index for written resources that don't have one. + if (info.is_written) { + written_resources_.insert(resource); + if (!info.IsResultIndexAssigned()) info.result_index = num_new_results_++; + } + + resources_.insert({resource, info}); + } + return success(); +} + +// Generates hoisted reads for all resources that need them just before the op. 
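Before the hoisted reads are generated below, Analyze above makes one non-obvious decision: a resource written in some but not all regions is also marked as read, so the non-writing paths (the other If/Case branch, or a while body that never executes) can forward the pre-op value. A plain-C++ model of just that rule, with an assumed helper name:

// Model of the "partially written implies read" rule used by Analyze().
static bool NeedsPreOpRead(bool is_read, unsigned regions_writing,
                           unsigned num_regions) {
  const bool is_written = regions_writing > 0;
  const bool written_in_all = regions_writing == num_regions;
  return is_read || (is_written && !written_in_all);
}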
+void RegionResourceHoister::GenerateHoistedReads() { + OpBuilder builder(op_); + for (auto& resource_it : GetResources()) { + Value resource = resource_it.first; + auto& info = resource_it.second; + + if (info.is_read) { + Operation* read = builder.create( + op_->getLoc(), info.data_type, resource); + read->setAttrs(info.read_attrs); + info.hoisted_read = read->getResult(0); + } + } +} + +// Replaces all resource reads with the hoisted read. +void RegionResourceHoister::ReplaceResourceLoads(Region& region, + bool read_only) { + assert(llvm::hasSingleElement(region) && "Expected single block region"); // Only iterate through ops directly in the body as we can't handle // ops nested deeper in regions. - for (Operation& op : llvm::make_early_inc_range(*block)) { - auto read_variable_op = dyn_cast(&op); - if (!read_variable_op) continue; - if (skip_load(read_variable_op)) continue; + auto all_reads = region.front().getOps(); + for (auto read_op : llvm::make_early_inc_range(all_reads)) { + Value resource = read_op.resource(); + if (!Contains(resource)) continue; - Value resource = read_variable_op.resource(); - auto p = resource_to_read_ops.insert({resource, read_variable_op}); - if (p.second) { - move_load(read_variable_op); - continue; + ResourceInfo& info = resources_[resource]; + // If replacing loads for read only resources, skip if the resource + // was written to. + if (read_only && info.is_written) continue; + + read_op.replaceAllUsesWith(info.hoisted_read); + read_op.erase(); + } +} + +// For written resources, add its value at the end of each region to that +// regions return value. For a region, its value at the end may be a value +// written to that resource in that region, or its hoisted read value if the +// resource is not written in that region. The return value can be vended out +// either as an existing return value, or a newly allocated return value. +void RegionResourceHoister::AppendResourceStoreValueToReturn( + RegionRange regions) { + for (Region* region : regions) { + assert(llvm::hasSingleElement(*region) && "Expected single block region"); + Block& front = region->front(); + auto old_return = front.getTerminator(); + assert(old_return->getNumOperands() == op_->getNumResults()); + auto new_return_operands = llvm::to_vector<4>(old_return->getOperands()); + new_return_operands.resize(num_new_results_); + + // initialize return values for written resources to be the hosited reads. + for (Value resource : written_resources_) { + const ResourceInfo& info = resources_[resource]; + new_return_operands[info.result_index] = info.hoisted_read; } - // Getting here means a load operation of this resource has been hoisted out - // before. Use hoisted load result to replace all uses of current op result - // and erase op. - op.replaceAllUsesWith(p.first->second); - op.erase(); - } -} + // Only iterate through ops directly in the body as op's embedded in child + // regions should have been lifted out. + auto assign_ops = front.getOps(); + for (auto assign_variable_op : llvm::make_early_inc_range(assign_ops)) { + Value resource = assign_variable_op.resource(); + if (!IsWritten(resource)) continue; -// If there are any stores to resource defined outside of the block then the -// stored values must be returned so that new values can be used by sunk -// resource stores. -// Returns true if any resource variable stored values are appended, otherwise -// false. 
-bool AppendResourceStoreValueToReturn(Block* body) { - bool has_resource_store = false; - auto old_return = body->getTerminator(); - - llvm::SmallVector new_return_operands(old_return->getOperands()); - - // Only iterate through ops directly in the body as we can't handle ops nested - // deeper in regions. - for (auto assign_variable_op : body->getOps()) { - Value resource = assign_variable_op.resource(); - if (!resource) continue; - - // Skip resources created inside of the body. - if (resource.getParentRegion() == body->getParent()) continue; - - // TODO(ycao): Prevent same value from being returned multiple times. - // TODO(ycao): Do not return resource store value if it is defined outside - // of cluster. - new_return_operands.push_back(assign_variable_op.value()); - has_resource_store = true; - } - - // If no resource stores are found, no need to update return op. - if (!has_resource_store) return false; - - OpBuilder builder(old_return); - builder.create(old_return->getLoc(), - new_return_operands); - old_return->erase(); - return true; -} - -// Moves resource store operations to after cluster. This assumes load-store -// forwarding has been performed on this cluster such that there is at most one -// resource store operation carrying its final value. -tf_device::ClusterOp SinkResourceStores(tf_device::ClusterOp cluster, - OpBuilder* builder) { - // Update ReturnOp inside cluster's body to output final values of updated - // external resources. - if (!AppendResourceStoreValueToReturn(&cluster.GetBody())) return cluster; - - auto new_return_op = cluster.GetBody().getTerminator(); - llvm::SmallVector new_return_types(new_return_op->getOperandTypes()); - - builder->setInsertionPoint(cluster); - auto new_cluster = builder->create( - cluster.getLoc(), new_return_types, - /*operands=*/llvm::SmallVector(), cluster.getAttrs()); - new_cluster.body().takeBody(cluster.body()); - - // Replace uses of old cluster results with those of new_cluster. - for (auto result : llvm::zip(cluster.getResults(), new_cluster.getResults())) - std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); - - // Create a mapping from operands of new_return_op operands to new_cluster - // results. - BlockAndValueMapping mapper; - for (auto operand_result : - llvm::zip(new_return_op->getOperands(), new_cluster.getResults())) - mapper.map(std::get<0>(operand_result), std::get<1>(operand_result)); - - // Clone all resource store ops and map their operands to values returned from - // new_cluster. - for (Operation& op : llvm::make_early_inc_range(new_cluster.GetBody())) { - if (isa(op)) { - builder->clone(op, mapper); - op.erase(); + // TODO(ycao): Prevent same value from being returned multiple times. + // TODO(ycao): Do not return resource store value if it is defined outside + // of cluster. Both of these can be post-resource-op-lifting cleanup + // passes. + int result_index = resources_[resource].result_index; + new_return_operands[result_index] = assign_variable_op.value(); + assign_variable_op.erase(); } + old_return->setOperands(new_return_operands); } - - cluster.erase(); - return new_cluster; } -// Hoists resource variable loads and sinks stores from cluster. -LogicalResult HoistResourceOpsFromCluster(tf_device::ClusterOp cluster, - ModuleOp module) { - OpBuilder builder(module); +// Replace the old op with a new op (with potentially additional results), and +// add stores to written resources after the new op. 
+void RegionResourceHoister::ReplaceOpWithNewOp() { + auto new_result_types = llvm::to_vector<4>(op_->getResultTypes()); + int result_region = isa(op_) ? 1 : 0; + Operation* terminator = op_->getRegion(result_region).front().getTerminator(); + auto extra_result_types = + terminator->getOperands().drop_front(op_->getNumResults()).getTypes(); + new_result_types.insert(new_result_types.end(), extra_result_types.begin(), + extra_result_types.end()); + OpBuilder builder(op_); + // Clone ths old operation but with new result types. + Operation* new_op = Operation::create( + op_->getLoc(), op_->getName(), new_result_types, + llvm::to_vector<4>(op_->getOperands()), op_->getAttrs(), + llvm::to_vector<4>(op_->getSuccessors()), op_->getNumRegions()); + builder.insert(new_op); - // Remove identity nodes to avoid aliasing. - RemoveIdentity(&cluster.GetBody()); - - // Perform store-load forwarding. So that each resource is only loaded with - // its initial value and is only stored with its final value. - ForwardStoreToLoad(&cluster.GetBody()); - - // Move loads of external resources, if any, to before cluster. - // (Skipping resources created inside of cluster.) - HoistResourceLoads( - &cluster.GetBody(), - /*skip_load=*/ - [&](TF::ReadVariableOp read) { - return read.resource().getParentRegion() == &cluster.body(); - }, - /*move_load=*/ - [&](TF::ReadVariableOp read) { - read.getOperation()->moveBefore(cluster); - }); - - // Move stores of external resources, if any, to after cluster. - auto new_cluster = SinkResourceStores(cluster, &builder); - - llvm::SetVector captured_values; - getUsedValuesDefinedAbove(new_cluster.body(), new_cluster.body(), - captured_values); - - for (Value v : captured_values) { - auto tensor_type = v.getType().dyn_cast(); - if (!tensor_type) continue; - if (!tensor_type.getElementType().isa()) continue; - - return new_cluster.emitOpError() - << "has remaining resource inputs that can not be lifted"; + // Move regions to the new op. + for (auto it : llvm::zip(op_->getRegions(), new_op->getRegions())) { + Region& old_region = std::get<0>(it); + Region& new_region = std::get<1>(it); + new_region.takeBody(old_region); } + // Insert stores to all written resources. + for (Value resource : written_resources_) { + ResourceInfo& info = resources_[resource]; + Value value_to_write = new_op->getResult(info.result_index); + Operation* write = builder.create( + op_->getLoc(), resource, value_to_write); + write->setAttrs(info.write_attrs); + } + + // As a part of lifting, we either reuse an existing slot for resource type + // results or add a new slot. Resource type results should not have any uses + // to begin with. So we can safely replace each old op result with the + // corresponding new op result. + int old_num_results = op_->getNumResults(); + op_->replaceAllUsesWith(new_op->getResults().take_front(old_num_results)); + op_->erase(); + op_ = nullptr; +} + +// Lift resource load and stores out of regions attached to `op`, where op is +// an If/case/cluster op. +LogicalResult RegionResourceHoister::HoistResourcesOutOfIfCaseCluster( + Operation* op) { + RegionResourceHoister hoister(op); + if (failed(hoister.Analyze())) return failure(); + + // If there are no resource region captures, then nothing to do. + if (!hoister.NeedsLifting()) return success(); + + // Start the transformation. For each region, replace the resource read with + // the value read before the op. 
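Taken together, the hoister brackets the op with resource accesses: a ReadVariableOp per lifted resource before the op, and an AssignVariableOp per written resource fed by the new op's extra result after it. The condensed sketch below shows that bracketing for a single resource; the helper and its parameters are illustrative, while the builder calls mirror the ones used above.

// Illustration of the read-before / write-after bracketing for one resource.
static void BracketWithReadAndWrite(Operation* old_op, Operation* new_op,
                                    Value resource, Type data_type,
                                    Value lifted_value) {
  OpBuilder builder(old_op);
  Operation* read = builder.create<TF::ReadVariableOp>(old_op->getLoc(),
                                                       data_type, resource);
  (void)read;  // in the pass, this result replaces the in-region reads
  builder.setInsertionPointAfter(new_op);
  builder.create<TF::AssignVariableOp>(new_op->getLoc(), resource,
                                       lifted_value);
}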
+  hoister.GenerateHoistedReads();
+  hoister.ReplaceResourceLoads(/*read_only=*/false);
+  hoister.AppendResourceStoreValueToReturn(op->getRegions());
+  hoister.ReplaceOpWithNewOp();
  return success();
}

+// Lift resource loads and stores out of WhileRegion.
+LogicalResult RegionResourceHoister::HoistResourcesOutOfWhileRegion(
+    TF::WhileRegionOp op) {
+  // For WhileRegion, post canonicalization all resources used within the
+  // body and condition regions are replaced with captured values, so we do not
+  // need to take into account the body and condition region arguments.
+  RegionResourceHoister hoister(op);
+
+  if (failed(hoister.Analyze())) return failure();
+
+  // If there are no resource region captures, then nothing to do.
+  if (!hoister.NeedsLifting()) return success();
+
+  // The resources captured for While loop fall into two categories:
+  //  (a) read-only. These reads can be replaced by a hoisted read created
+  //      before the WhileOp (similar to if and case).
+  //  (b) written: since the value is written in the loop (which can only
+  //      happen in the loop body), all these will become loop variables. Since
+  //      all resource variables are removed from the loop variables during
+  //      canonicalization, we need to create new operand/result slots. The
+  //      input operands for these slots are the read values prior to the op,
+  //      and all references to these are replaced by the corresponding slot
+  //      argument. We need to generate writes following the while for these
+  //      resources.
+  //
+  // Note that for WhileRegion ops, if a resource is written, it will be written
+  // only in the body and not the condition, so the hoister analysis will infer
+  // it as needing a read as well.
+
+  // Generate hoisted reads before the while.
+  hoister.GenerateHoistedReads();
+
+  // Replace just the read-only resources with the hoisted reads.
+  hoister.ReplaceResourceLoads(/*read_only=*/true);
+
+  // For written resources, add additional operands to the while op.
+  int num_old_results = op.getNumResults();
+  int num_new_results = hoister.GetLiftedNumResults();
+  int num_extra_results = num_new_results - num_old_results;
+
+  SmallVector new_result_types;
+  SmallVector new_while_operands;
+  new_result_types.resize(num_extra_results);
+  new_while_operands.resize(num_extra_results);
+
+  for (auto& it : hoister.GetResources()) {
+    if (!it.second.is_written) continue;
+    int index = it.second.result_index - num_old_results;
+    new_result_types[index] = it.second.data_type;
+    new_while_operands[index] = it.second.hoisted_read;
+  }
+  op.getOperation()->insertOperands(op.getNumOperands(), new_while_operands);
+
+  // Patch the cond and body regions to have additional arguments, and replace
+  // the remaining resource reads (which will be resource reads for written
+  // resources) with these arguments.
+  for (Region* region : op.getRegions()) {
+    region->addArguments(new_result_types);
+    // Point hoisted read for written resources to the region's arguments.
+    for (auto& it : hoister.GetResources()) {
+      if (!it.second.is_written) continue;
+      it.second.hoisted_read = region->getArgument(it.second.result_index);
+    }
+    hoister.ReplaceResourceLoads(*region, /*read_only=*/false);
+  }
+
+  // Add additional return values to body return. These correspond to values
+  // written to resources in the body region.
+  hoister.AppendResourceStoreValueToReturn(op.getRegions().drop_front());
+
+  // Finally, create a new while with additional return values.
+ hoister.ReplaceOpWithNewOp(); + return success(); +} + +// Lift resources out of the regions attached to `op` +LogicalResult RegionResourceHoister::ReplaceOpWithNewOp(Operation* op) { + if (auto while_op = dyn_cast(op)) + return HoistResourcesOutOfWhileRegion(while_op); + return HoistResourcesOutOfIfCaseCluster(op); +} + // Holds information about a function's use of a resource argument. struct ResourceArgUseInfo { + // Data type of the data contained in the resource. Type data_type; + // Is the resource argument used in an assign op? bool updated; + // Is the resource argument used in a read or assign op? bool used; }; @@ -356,34 +650,35 @@ struct ResourceArgUseInfo { LogicalResult FindResourceArgUseInfo( FuncOp func_op, llvm::SmallDenseMap* result) { auto return_op = func_op.front().getTerminator(); - for (auto arg : func_op.getArguments()) { - if (!getElementTypeOrSelf(arg.getType()).isa()) continue; + for (auto arg : TF::filter_resources(func_op.getArguments())) { ResourceArgUseInfo info; info.used = false; info.updated = false; - bool do_not_touch = false; + bool read_or_assigned = false; + bool used_in_unsupported_op = false; for (auto user : arg.getUsers()) { if (user == return_op) continue; + info.used = true; if (auto read = llvm::dyn_cast(user)) { - info.used = true; + read_or_assigned = true; info.data_type = read.getType(); continue; } + if (auto assign = llvm::dyn_cast(user)) { - info.used = true; + read_or_assigned = true; info.updated = true; info.data_type = assign.value().getType(); continue; } - if (isa(user)) { - // Stacks will be handled by a separate pass. - do_not_touch = true; - break; - } - user->emitOpError("found unsupported operations on resource."); - return failure(); + + used_in_unsupported_op = true; + break; } - if (!do_not_touch) (*result)[arg.getArgNumber()] = info; + + // If the arg is used in an unsupported op, skip lifting it. + if (used_in_unsupported_op) continue; + (*result)[arg.getArgNumber()] = info; } return success(); } @@ -469,59 +764,61 @@ void RemoveUnusedResourceArgumentsAndForwardedRetvals( // signature. resource_data_types is the (index, data type) pair for each // resource argument. handle_updated_arg_value is a caller-provided function // that handles the updated value for an resource argument. -void LiftArgRetResourcesForFunction( +LogicalResult LiftArgRetResourcesForFunction( FuncOp func_op, const llvm::SmallDenseMap& resource_data_types, llvm::function_ref handle_updated_arg_value) { ForwardStoreToLoad(&func_op.front()); - // Maps a resource argument to the first read. - llvm::SmallDenseMap resource_arg_read; - // Maps a resource argument to the last write. - llvm::SmallDenseMap resource_arg_write; - // Use HoistResourceLoads to CSE loads and the `move_load` function only - // records the remaining load to resource_arg_read. - HoistResourceLoads( - &func_op.front(), - /*skip_load=*/ - [&](TF::ReadVariableOp read) { - return !read.resource().isa(); - }, - /*move_load=*/ - [&](TF::ReadVariableOp read) { - resource_arg_read[read.resource()] = read; - }); - // Record the stores in resource_arg_read. - for (auto& op : llvm::make_early_inc_range(func_op.front())) { - auto write = llvm::dyn_cast(&op); - if (!write) continue; - auto arg = write.resource().dyn_cast(); - if (!arg) continue; - // After ForwardStoreToLoad(), there should be just one store for each - // resource. - resource_arg_write[arg] = write; - } - // Now change the input types to non-resource and remove the internal loads. 
-  auto new_types = llvm::to_vector<8>(func_op.getType().getInputs());
-  for (auto& entry : resource_data_types) {
-    auto arg = func_op.getArgument(entry.getFirst());
-    auto read_it = resource_arg_read.find(arg);
-    auto write_it = resource_arg_write.find(arg);
-    arg.setType(entry.getSecond());
-    new_types[arg.getArgNumber()] = entry.getSecond();
-    if (read_it != resource_arg_read.end()) {
-      read_it->getSecond().replaceAllUsesWith(arg);
-      read_it->getSecond().erase();
-    }
-    if (write_it != resource_arg_write.end()) {
-      handle_updated_arg_value(arg.getArgNumber(),
-                               write_it->getSecond().value());
-      write_it->getSecond().erase();
+
+  RegionResourceHoister hoister(func_op);
+  if (failed(hoister.Analyze())) return failure();
+
+  // Each of these resources could be read or written in the function. If it is
+  // read, we need to replace the resource arg with a value arg to get the
+  // read value. If it is written, we need to replace the write with an
+  // additional value to be written.
+
+  // Now create read values that will be used to replace each resource that
+  // is read in the function body. These read values are just the same argument
+  // with its type replaced.
+  llvm::SmallVector skipped_args;
+  for (auto& it : hoister.GetResources()) {
+    BlockArgument arg = it.first.dyn_cast();
+    assert(arg && "Expect resources for FuncOp to be its arguments");
+    auto type_iter = resource_data_types.find(arg.getArgNumber());
+    if (type_iter == resource_data_types.end()) {
+      // Skip lifting the resource if it's not present in the data type map.
+      // This indicates that the resource is not to be lifted because it is used
+      // in an unsupported op in some other function.
+      skipped_args.push_back(arg);
+    } else {
+      arg.setType(type_iter->second);
+      it.second.hoisted_read = arg;
+    }
  }
-  func_op.setType(FunctionType::get(
-      new_types,
-      llvm::to_vector<4>(func_op.front().getTerminator()->getOperandTypes()),
-      func_op.getContext()));
+
+  // Drop all the args that have to be skipped.
+  for (Value arg : skipped_args) hoister.DropResource(arg);
+
+  hoister.ReplaceResourceLoads(/*read_only=*/false);
+
+  // For writes, invoke the callback and then erase the write.
+  auto assign_ops = func_op.front().getOps();
+  for (auto assign_variable_op : llvm::make_early_inc_range(assign_ops)) {
+    Value resource = assign_variable_op.resource();
+    if (!hoister.Contains(resource)) continue;
+
+    auto arg = resource.dyn_cast();
+    handle_updated_arg_value(arg.getArgNumber(), assign_variable_op.value());
+    assign_variable_op.erase();
+  }
+
+  func_op.setType(
+      FunctionType::get(func_op.front().getArgumentTypes(),
+                        func_op.front().getTerminator()->getOperandTypes(),
+                        func_op.getContext()));
+
+  return success();
}

// Returns a vector filtered from range where the unused elements (specified by
@@ -570,29 +867,7 @@ void AddLoadsStoresOutsideControlFlowOp(
// Lifts loads/stores from while loop's body and cond functions.
LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) {
-  // Remove identity nodes to avoid aliasing.
-  RemoveIdentity(&body.front());
-  RemoveIdentity(&cond.front());
  auto return_op = body.front().getTerminator();
-  // Sanity check: body resource input/output should alias each other.
- for (auto arg : body.getArguments()) { - if (!getElementTypeOrSelf(arg.getType()).isa()) continue; - if (return_op->getOperand(arg.getArgNumber()) != arg) { - return return_op->emitOpError( - "resource used in while loop is only supported when the ") - << "resource input and output alias each other in the loop body."; - } - } - // FindResourceArgUseInfo will check supported resource ops (read and assign), - // but loop condition has additional requirement that it cannot write - // resources. - if (cond.walk([&](TF::AssignVariableOp assign) { - assign.emitOpError("found resource write in loop condition."); - return WalkResult::interrupt(); - }) - .wasInterrupted()) { - return failure(); - } llvm::SmallDenseMap body_use_info; llvm::SmallDenseMap cond_use_info; if (failed(FindResourceArgUseInfo(body, &body_use_info)) || @@ -603,12 +878,7 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { auto resource_arg_uses = MergeArgResourceUseInfo(body_use_info, cond_use_info); if (resource_arg_uses.empty()) return success(); - for (const auto& entry : resource_arg_uses) { - // Replace output resource uses with the input, so that we can later freely - // change the output type. - while_op.getResult(entry.getFirst()) - .replaceAllUsesWith(while_op.getOperand(entry.getFirst())); - } + // Remove unused resources in functions. llvm::SmallVector old_to_new_indices; llvm::SmallDenseMap remaining_resource_data_types; @@ -661,50 +931,8 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { // Lifts loads/stores from an IfOp or CaseOp's branches. template LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { - // Remove identity nodes to avoid aliasing. - for (auto func : branches) RemoveIdentity(&func.front()); - - // Sanity check: branch return of resources should be aliases of inputs. If - // so, replace the output uses with the input so that we can remove these - // outputs. - for (OpResult result : op.getResults()) { - if (!getElementTypeOrSelf(result.getType()).isa()) - continue; - unsigned result_index = result.getResultNumber(); - constexpr unsigned kUnassigned = -1; - unsigned common_aliasing_arg_num = kUnassigned; - for (auto func : branches) { - auto retval = func.front().getTerminator()->getOperand(result_index); - assert(result.getType() == retval.getType()); - auto aliasing_arg = retval.dyn_cast(); - if (!aliasing_arg) - return op.emitOpError("unsupported output: ") - << "resource does not alias input"; - if (common_aliasing_arg_num == kUnassigned) - common_aliasing_arg_num = aliasing_arg.getArgNumber(); - if (aliasing_arg.getArgNumber() != common_aliasing_arg_num) - return op.emitOpError("unsupported output: ") - << "resource does not alias a single input"; - } - assert(common_aliasing_arg_num != kUnassigned); - result.replaceAllUsesWith(op.getOperand(common_aliasing_arg_num + 1)); - } - - // Erase the resource outputs from the branches. 
- int64_t non_resource_results = 0; - llvm::SmallVector old_to_new_output_indices; - bool output_removed = false; - for (auto result : op.getResults()) { - if (!getElementTypeOrSelf(result.getType()) - .template isa()) { - old_to_new_output_indices.push_back(non_resource_results++); - continue; - } - old_to_new_output_indices.push_back(-1); - for (auto func : branches) - func.front().getTerminator()->eraseOperand(non_resource_results); - output_removed = true; - } + // For canonicalized If/Case, there should not be any resource outputs + int64_t non_resource_results = op.getNumResults(); llvm::SmallDenseMap resource_arg_uses; if (failed(FindResourceArgUseInfo(branches.front(), &resource_arg_uses))) @@ -719,7 +947,7 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { MergeArgResourceUseInfo(resource_arg_uses, branch_use_info); } - if (resource_arg_uses.empty() && !output_removed) return success(); + if (resource_arg_uses.empty()) return success(); // Remove unused resources in functions. llvm::SmallDenseMap remaining_resource_data_types; RemoveUnusedResourceArgumentsAndForwardedRetvals( @@ -794,12 +1022,7 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { AddLoadsStoresOutsideControlFlowOp(new_op, arg_data_type_and_updated_output_index); // Replace uses. - for (int64_t i = 0, end = old_to_new_output_indices.size(); i < end; ++i) { - if (old_to_new_output_indices[i] >= 0) { - op.getResult(i).replaceAllUsesWith( - new_op.getResult(old_to_new_output_indices[i])); - } - } + op.replaceAllUsesWith(new_op.getResults().take_front(op.getNumResults())); op.erase(); return success(); } @@ -825,8 +1048,6 @@ struct PartitionedCallLiftingInfo { // happens on a clone, which will be stored in `result`. LogicalResult HandlePartitionedCallOpCallee( FuncOp callee, PartitionedCallLiftingInfo* result) { - // Remove identity nodes to avoid aliasing. - RemoveIdentity(&callee.front()); // Sanity check: return of resources should be aliases of inputs. Such outputs // will be removed later. int64_t non_resource_results = 0; @@ -914,8 +1135,8 @@ LogicalResult HandlePartitionedCallOpCallee( // resource-lifted new callee function in lifting_info. template void UpdatePartitionedCallOpWithNewCallee( - CallOpType call_op, const PartitionedCallLiftingInfo& lifting_info) { - if (lifting_info.lifted_callee == nullptr) return; + CallOpType call_op, PartitionedCallLiftingInfo& lifting_info) { + if (!lifting_info.lifted_callee) return; // Replace output resource uses with the aliasing input, so that we can remove // this output. for (const auto& entry : lifting_info.old_outputs_aliasing_old_inputs) { @@ -929,12 +1150,10 @@ void UpdatePartitionedCallOpWithNewCallee( auto new_operands = FilterRange(call_op.args(), lifting_info.use_info); auto new_call = builder.create( - call_op.getLoc(), - const_cast(lifting_info.lifted_callee).getType().getResults(), + call_op.getLoc(), lifting_info.lifted_callee.getType().getResults(), new_operands, call_op.getAttrs()); new_call.setAttr( - "f", builder.getSymbolRefAttr( - const_cast(lifting_info.lifted_callee).getName())); + "f", builder.getSymbolRefAttr(lifting_info.lifted_callee.getName())); AddLoadsStoresOutsideControlFlowOp( new_call, lifting_info.arg_data_type_and_updated_output_index); // Replace uses. 
@@ -948,8 +1167,9 @@ void UpdatePartitionedCallOpWithNewCallee( call_op.erase(); } -LogicalResult HoistForFunctionalControlFlow( - Block*, ModuleOp, llvm::SmallDenseMap*); +LogicalResult HoistForControlFlow( + Block*, ModuleOp, + llvm::SmallDenseMap*); // A templated routine for handling both PartitionedCallOp and // StatefulPartitionedCallOp. If the callee is already lifted, it just updates @@ -958,12 +1178,15 @@ LogicalResult HoistForFunctionalControlFlow( template LogicalResult HandlePartitionedCallOp( CallOpType call_op, FuncOp callee, ModuleOp module, - llvm::SmallDenseMap* lifted_callees) { - auto emplace_res = - lifted_callees->try_emplace(callee, PartitionedCallLiftingInfo()); + llvm::SmallDenseMap* + lifted_callees) { + auto emplace_res = lifted_callees->try_emplace(callee.getName(), + PartitionedCallLiftingInfo()); if (emplace_res.second) { // Unseen callee. Perform resource lifting on it. - HoistForFunctionalControlFlow(&callee.front(), module, lifted_callees); + if (failed(HoistForControlFlow(&callee.front(), module, lifted_callees))) + return failure(); + if (failed(HandlePartitionedCallOpCallee( callee, &emplace_res.first->getSecond()))) { return failure(); @@ -975,30 +1198,28 @@ LogicalResult HandlePartitionedCallOp( // Hoists resource loads/stores from control flow ops in `block` outside the // body/cond/branch/callee functions. -LogicalResult HoistForFunctionalControlFlow( +LogicalResult HoistForControlFlow( Block* block, ModuleOp module, - llvm::SmallDenseMap* + llvm::SmallDenseMap* lifted_partitioned_call_callees) { - // Remove identity nodes to avoid aliasing. - RemoveIdentity(block); for (Operation& op : llvm::make_early_inc_range(*block)) { if (auto while_op = llvm::dyn_cast(&op)) { - auto body = while_op.body_func(); - auto cond = while_op.cond_func(); + auto body = while_op.body_function(); + auto cond = while_op.cond_function(); // Recursively handle the nested control flow. - HoistForFunctionalControlFlow(&body.front(), module, - lifted_partitioned_call_callees); - HoistForFunctionalControlFlow(&cond.front(), module, - lifted_partitioned_call_callees); + HoistForControlFlow(&body.front(), module, + lifted_partitioned_call_callees); + HoistForControlFlow(&cond.front(), module, + lifted_partitioned_call_callees); if (failed(HandleWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); // Recursively handle the nested control flow. - HoistForFunctionalControlFlow(&then_branch.front(), module, - lifted_partitioned_call_callees); - HoistForFunctionalControlFlow(&else_branch.front(), module, - lifted_partitioned_call_callees); + HoistForControlFlow(&then_branch.front(), module, + lifted_partitioned_call_callees); + HoistForControlFlow(&else_branch.front(), module, + lifted_partitioned_call_callees); if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch}))) return failure(); } else if (auto case_op = llvm::dyn_cast(&op)) { @@ -1008,16 +1229,17 @@ LogicalResult HoistForFunctionalControlFlow( FuncOp func = module.lookupSymbol(branch.cast()); // Recursively handle the nested control flow. 
- HoistForFunctionalControlFlow(&func.front(), module, - lifted_partitioned_call_callees); + HoistForControlFlow(&func.front(), module, + lifted_partitioned_call_callees); branch_functions.push_back(func); } if (failed(HandleCaseOrIfOp(case_op, branch_functions))) return failure(); } else if (auto call_op = llvm::dyn_cast(&op)) { auto callee = call_op.func(); - if (!callee) + if (!callee) { return call_op.emitOpError( "resource lifting does not support call with nested references."); + } if (failed(HandlePartitionedCallOp(call_op, callee, module, lifted_partitioned_call_callees))) { // Nested control flow handling is done in HandlePartitionedCallOp(). @@ -1029,26 +1251,19 @@ LogicalResult HoistForFunctionalControlFlow( lifted_partitioned_call_callees))) { return failure(); } + } else if (isa(op)) { + for (Region& region : op.getRegions()) + HoistForControlFlow(®ion.front(), module, + lifted_partitioned_call_callees); + LogicalResult result = RegionResourceHoister::ReplaceOpWithNewOp(&op); + if (failed(result)) return failure(); } } - // Remove unused local variables. + // After we have hoisted operations in the block, we may have added new read + // and writes of resources to this block. Clean them up by doing store-load + // forwarding. ForwardStoreToLoad(block); - llvm::SmallVector local_vars; - for (Operation& op : *block) { - if (auto local_var = llvm::dyn_cast(&op)) { - local_vars.push_back(local_var); - } - } - for (auto local_var : local_vars) { - if (llvm::all_of(local_var.resource().getUsers(), - [](const Operation* user) { - return isa(user); - })) { - for (auto user : local_var.resource().getUsers()) user->erase(); - local_var.erase(); - } - } return success(); } @@ -1056,22 +1271,25 @@ LogicalResult HoistForFunctionalControlFlow( // Returns failure if there are remaining resource-type values that can not be // lifted. 
void ResourceOpLiftingPass::runOnOperation() { - llvm::SmallDenseMap + llvm::SmallDenseMap lifted_partitioned_call_callees; ModuleOp module = getOperation(); - auto result = module.walk([&](FuncOp func_op) { + + if (failed(TF::CleanupAndCanonicalizeForResourceOpLifting(module))) + return signalPassFailure(); + + auto walk_result = module.walk([&](FuncOp func_op) { return func_op.walk([&](tf_device::ClusterOp cluster) { - if (failed(HoistForFunctionalControlFlow( - &cluster.GetBody(), module, &lifted_partitioned_call_callees)) || - failed(HoistResourceOpsFromCluster(cluster, module))) { - return WalkResult::interrupt(); - } + LogicalResult result = HoistForControlFlow( + &cluster.GetBody(), module, &lifted_partitioned_call_callees); + if (failed(result)) return WalkResult::interrupt(); + result = RegionResourceHoister::ReplaceOpWithNewOp(cluster); + if (failed(result)) return WalkResult::interrupt(); return WalkResult::advance(); }); }); - if (result.wasInterrupted()) { - signalPassFailure(); - } + + if (walk_result.wasInterrupted()) return signalPassFailure(); } struct ResourceOpLiftingForMainFunctionPass @@ -1121,11 +1339,14 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { << function.getBlocks().size(); } - llvm::SmallDenseMap + if (failed(TF::CleanupAndCanonicalizeForResourceOpLifting(function))) + return failure(); + + llvm::SmallDenseMap lifted_partitioned_call_callees; - return HoistForFunctionalControlFlow(&function.front(), - cast(function.getParentOp()), - &lifted_partitioned_call_callees); + return HoistForControlFlow(&function.front(), + cast(function.getParentOp()), + &lifted_partitioned_call_callees); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc new file mode 100644 index 00000000000..97030595c99 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc @@ -0,0 +1,464 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h" + +#include "llvm/ADT/BitVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace { + +bool IsResource(Value value) { + return getElementTypeOrSelf(value.getType()).isa(); +} + +// Removes identity nodes in the block. The device computation does not need +// such nodes to carry information. 
+void RemoveIdentity(Block &block) {
+  for (auto &op : llvm::make_early_inc_range(block)) {
+    if (isa(&op)) {
+      op.replaceAllUsesWith(op.getOperands());
+      op.erase();
+    }
+  }
+}
+
+// Eliminate local variables that are only assigned to but never read, and thus
+// are dead.
+void RemoveDeadLocalVariables(Block &block) {
+  llvm::SmallVector local_vars;
+  for (Operation &op : block) {
+    if (auto local_var = llvm::dyn_cast(&op)) {
+      local_vars.push_back(local_var);
+    }
+  }
+  for (auto local_var : local_vars) {
+    if (llvm::all_of(local_var.resource().getUsers(),
+                     [](const Operation *user) {
+                       return isa(user);
+                     })) {
+      for (auto user : local_var.resource().getUsers()) user->erase();
+      local_var.erase();
+    }
+  }
+}
+
+LogicalResult CleanupAndCanonicalize(Operation *parent_op);
+
+// Eliminates unused results from an operation `op` by cloning it with reduced
+// result types and doing appropriate use replacements. `results_to_eliminate`
+// is a bitvector of result positions to eliminate. If it is null, then all
+// unused results of the operation will be eliminated.
+void EliminateUnusedResults(
+    Operation *op, const llvm::BitVector *results_to_eliminate = nullptr) {
+  auto can_eliminate = [&](OpResult &result) -> bool {
+    if (!result.use_empty()) return false;
+    if (results_to_eliminate)
+      return results_to_eliminate->test(result.getResultNumber());
+    else
+      return true;
+  };
+  SmallVector new_result_types;
+  for (OpResult result : op->getResults()) {
+    if (can_eliminate(result)) continue;
+    new_result_types.push_back(result.getType());
+  }
+
+  // Rebuild the operation with fewer results.
+  OpBuilder builder(op);
+  Operation *new_op = Operation::create(
+      op->getLoc(), op->getName(), new_result_types,
+      llvm::to_vector<4>(op->getOperands()), op->getAttrs(),
+      llvm::to_vector<4>(op->getSuccessors()), op->getNumRegions());
+  builder.insert(new_op);
+
+  // Move region bodies to the new operation.
+  for (auto it : llvm::zip(op->getRegions(), new_op->getRegions())) {
+    Region &old_region = std::get<0>(it);
+    Region &new_region = std::get<1>(it);
+    new_region.takeBody(old_region);
+  }
+
+  // Replace used results and erase the old op.
+  int next_result_idx = 0;
+  for (OpResult result : op->getResults()) {
+    if (can_eliminate(result)) continue;
+    result.replaceAllUsesWith(new_op->getResult(next_result_idx++));
+  }
+  op->erase();
+}
+
+// Clones a function if it cannot be patched in place. Clone if there are
+// multiple uses or unknown uses (for external functions). The cloned function
+// will be marked as private.
+FuncOp CloneFunctionIfNeeded(FuncOp func) {
+  ModuleOp module = func.getParentOfType();
+  auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion());
+  if (func_uses.hasValue() && llvm::hasSingleElement(func_uses.getValue()))
+    return func;
+  FuncOp cloned = func.clone();
+  cloned.setVisibility(SymbolTable::Visibility::Private);
+  cloned.setName(func.getName().str() + "_lifted");
+  SymbolTable(module).insert(cloned);
+  return cloned;
+}
+
+// Eliminates unused results for If/Case operations. Also patches up the
+// branch functions to (a) drop the unused return values, and (b) if, as a
+// result, some argument becomes unused in all branches, drop that argument
+// and the corresponding if/case input operand.
+void EliminateUnusedResultsForIfCase(Operation *op, ArrayRef branches) {
+  // Clone branch functions if needed since we will be mutating them.
+  SmallVector cloned_branches;
+  cloned_branches.reserve(branches.size());
+  for (FuncOp func : branches) {
+    FuncOp cloned = CloneFunctionIfNeeded(func);
+    cloned_branches.push_back(cloned);
+    if (cloned == func) continue;
+    // Patch up the op attribute to point to the new function.
+    for (NamedAttribute attr : op->getAttrs()) {
+      auto symref = attr.second.dyn_cast();
+      if (!symref) continue;
+      if (symref.getValue() != func.getName()) continue;
+      op->setAttr(attr.first,
+                  FlatSymbolRefAttr::get(cloned.getName(), op->getContext()));
+      break;
+    }
+  }
+
+  // Traverse results backward so that indices to be deleted stay unchanged.
+  for (OpResult result : llvm::reverse(op->getResults())) {
+    if (!result.use_empty()) continue;
+    int result_idx = result.getResultNumber();
+    for (FuncOp func : cloned_branches)
+      func.front().getTerminator()->eraseOperand(result_idx);
+  }
+
+  // Check which function arguments are unused in all branches. We can drop
+  // those as well.
+  int num_args = cloned_branches[0].getNumArguments();
+  llvm::BitVector used_args(num_args);
+  for (FuncOp func : branches) {
+    for (BlockArgument arg : func.getArguments()) {
+      if (!arg.use_empty()) used_args.set(arg.getArgNumber());
+    }
+  }
+
+  // There are some unused args that we can drop. Also drop the corresponding
+  // input operand.
+  if (used_args.count() != num_args) {
+    // Traverse arguments backward so that indices to be deleted stay unchanged.
+    for (int idx = num_args - 1; idx >= 0; --idx) {
+      if (used_args.test(idx)) continue;
+      for (FuncOp func : cloned_branches) func.eraseArgument(idx);
+      // For if/case, arg #i of the attached function corresponds to operand
+      // #i+1.
+      op->eraseOperand(idx + 1);
+    }
+  }
+
+  // Patch up function types (with fewer return values and potentially fewer
+  // arguments).
+  for (FuncOp func : cloned_branches) {
+    func.setType(FunctionType::get(
+        func.front().getArgumentTypes(),
+        func.front().getTerminator()->getOperandTypes(), func.getContext()));
+  }
+
+  EliminateUnusedResults(op);
+}
+
+// Eliminates unused results from a functional while.
+void EliminateUnusedResultsForWhile(TF::WhileOp op) {
+  FuncOp cond = op.cond_function();
+  FuncOp body = op.body_function();
+
+  llvm::BitVector can_eliminate(op.getNumResults());
+  for (OpResult result : llvm::reverse(op.getResults())) {
+    if (!result.use_empty()) continue;
+    int result_idx = result.getResultNumber();
+    BlockArgument cond_arg = cond.getArgument(result_idx);
+    BlockArgument body_arg = body.getArgument(result_idx);
+    Operation *body_ret = body.front().getTerminator();
+    // We can eliminate a result if it is unused, the corresponding argument is
+    // unused in cond, and the only use in body is as a return value.
+    if (cond_arg.use_empty() && body_arg.hasOneUse() &&
+        body_arg.use_begin()->getOperandNumber() == result_idx &&
+        body_arg.use_begin()->getOwner() == body_ret) {
+      can_eliminate.set(result_idx);
+    }
+  }
+
+  if (can_eliminate.empty()) return;
+
+  FuncOp cloned_cond = CloneFunctionIfNeeded(cond);
+  FuncOp cloned_body = CloneFunctionIfNeeded(body);
+  op.condAttr(FlatSymbolRefAttr::get(cloned_cond.getName(), op.getContext()));
+  op.bodyAttr(FlatSymbolRefAttr::get(cloned_body.getName(), op.getContext()));
+
+  // Drop cond/body args and return value. WhileOp result will be dropped later
+  // in EliminateUnusedResults. Traverse in reverse order so that indices to be
+  // deleted stay unchanged.
+  for (int idx = op.getNumResults() - 1; idx >= 0; --idx) {
+    if (!can_eliminate.test(idx)) continue;
+    cloned_cond.eraseArgument(idx);
+    cloned_body.front().getTerminator()->eraseOperand(idx);
+    cloned_body.eraseArgument(idx);
+  }
+
+  // Patch up branch function types.
+  for (FuncOp func : {cloned_cond, cloned_body}) {
+    func.setType(FunctionType::get(
+        func.front().getArgumentTypes(),
+        func.front().getTerminator()->getOperandTypes(), func.getContext()));
+  }
+  EliminateUnusedResults(op, &can_eliminate);
+}
+
+// For resource results, replace all uses with the resource input to which the
+// result is tied. After this, resource outputs of this op are expected to be
+// unused.
+LogicalResult ForwardCommonArgToOutput(Operation *op, ArrayRef branches,
+                                       ValueRange branch_args,
+                                       bool &has_resource_result) {
+  // For while, the branch inputs and outputs need to match.
+  bool io_match = isa(op);
+
+  has_resource_result = false;
+  // Check if the same input argument number is passed through all functions.
+  for (OpResult result : op->getResults()) {
+    if (!IsResource(result)) continue;
+
+    has_resource_result = true;
+    int result_idx = result.getResultNumber();
+    Optional common_arg_index;
+    for (FuncOp func : branches) {
+      auto ret = func.front().getTerminator();
+      auto block_arg = ret->getOperand(result_idx).dyn_cast();
+      if (!block_arg) {
+        return op->emitOpError("result #")
+               << result_idx << " not tied to function argument for branch @"
+               << func.getName();
+      }
+      if (!common_arg_index.hasValue()) {
+        common_arg_index = block_arg.getArgNumber();
+      } else if (common_arg_index.getValue() != block_arg.getArgNumber()) {
+        return op->emitError("result #")
+               << result_idx
+               << " is not tied to the same argument across all branches";
+      }
+    }
+
+    if (io_match && result_idx != common_arg_index.getValue()) {
+      return op->emitOpError("Result #")
+             << result_idx << " is tied to argument #"
+             << common_arg_index.getValue();
+    }
+
+    // Forward the corresponding input to the output.
+    result.replaceAllUsesWith(branch_args[common_arg_index.getValue()]);
+  }
+  return success();
+}
+
+// Canonicalizes a functional If/Case. Forwards the input argument to resource
+// results and then deletes the resource results.
+LogicalResult CanonicalizeFunctionalIfCase(Operation *op,
+                                           ArrayRef branches,
+                                           ValueRange branch_args) {
+  for (FuncOp func : branches) {
+    if (failed(CleanupAndCanonicalize(func))) return failure();
+  }
+
+  bool has_resource_result = false;
+  if (failed(ForwardCommonArgToOutput(op, branches, branch_args,
+                                      has_resource_result)))
+    return failure();
+
+  // If no resource type results were found, no further cleanup needed.
+  if (!has_resource_result) return success();
+
+  // Drop unused results.
+  EliminateUnusedResultsForIfCase(op, branches);
+  return success();
+}
+
+// Canonicalizes a functional While. Forwards the common argument to results
+// and drops resource results if possible.
+LogicalResult CanonicalizeFunctionalWhile(TF::WhileOp op) {
+  for (FuncOp func : {op.cond_function(), op.body_function()}) {
+    if (failed(CleanupAndCanonicalize(func))) return failure();
+  }
+
+  // For while, just use the body function to forward operand to result.
+  bool has_resource_result = false;
+  if (failed(ForwardCommonArgToOutput(op, {op.body_function()},
+                                      op.getOperands(), has_resource_result)))
+    return failure();
+  // If no resource type results were found, no further cleanup needed.
+  if (!has_resource_result) return success();
+
+  // Drop unused results.
+  EliminateUnusedResultsForWhile(op);
+  return success();
+}
+
+// Canonicalizes region based if/case and cluster operations. If the same
+// captured resource-typed value is used for all region results, then that
+// value is forwarded to the result and the result is dropped.
+LogicalResult CanonicalizeRegionIfCaseCluster(Operation *op) {
+  // Check if the same value is used for all region results for this output.
+  bool has_resource_result = false;
+  for (OpResult result : op->getResults()) {
+    if (!IsResource(result)) continue;
+    has_resource_result = true;
+    int result_idx = result.getResultNumber();
+
+    Value ret0 =
+        op->getRegion(0).front().getTerminator()->getOperand(result_idx);
+    for (Region &region : op->getRegions().drop_front()) {
+      Value ret = region.front().getTerminator()->getOperand(result_idx);
+      if (ret != ret0) {
+        return op->emitError("Result #")
+               << result_idx
+               << " not tied to the same capture across all regions";
+      }
+    }
+    result.replaceAllUsesWith(ret0);
+  }
+
+  if (!has_resource_result) return success();
+
+  // Eliminate unused region results. Traverse in reverse order so that
+  // indices to be deleted stay unchanged.
+  for (OpResult result : llvm::reverse(op->getResults())) {
+    if (!result.use_empty()) continue;
+    int result_idx = result.getResultNumber();
+    for (Region &region : op->getRegions())
+      region.front().getTerminator()->eraseOperand(result_idx);
+  }
+  EliminateUnusedResults(op);
+  return success();
+}
+
+// Canonicalizes a region based while. If the same value is passed through
+// the body, the result is replaced with the operand and all arguments/results
+// and return values corresponding to that result are dropped.
+LogicalResult CanonicalizeWhileRegion(TF::WhileRegionOp op) {
+  Region &body = op.body();
+  Region &cond = op.cond();
+  llvm::BitVector can_eliminate(op.getNumResults());
+
+  // Traverse in reverse order so that indices to be deleted stay unchanged.
+  for (OpResult result : llvm::reverse(op.getResults())) {
+    if (!IsResource(result)) continue;
+    int result_idx = result.getResultNumber();
+    auto body_arg = body.front()
+                        .getTerminator()
+                        ->getOperand(result_idx)
+                        .dyn_cast();
+    if (!body_arg || body_arg.getArgNumber() != result_idx) {
+      return op.emitOpError("Result #") << result_idx << " is not tied to arg #"
+                                        << result_idx << " of the body";
+    }
+    body.getArgument(result_idx).replaceAllUsesWith(op.getOperand(result_idx));
+    cond.getArgument(result_idx).replaceAllUsesWith(op.getOperand(result_idx));
+    body.front().getTerminator()->eraseOperand(result_idx);
+    body.eraseArgument(result_idx);
+    cond.eraseArgument(result_idx);
+    result.replaceAllUsesWith(op.getOperand(result_idx));
+    op.getOperation()->eraseOperand(result_idx);
+    can_eliminate.set(result_idx);
+  }
+  EliminateUnusedResults(op, &can_eliminate);
+  return success();
+}
+
+// Removes identities and canonicalizes all operations within `parent_op`.
+LogicalResult CleanupAndCanonicalize(Operation *parent_op) {
+  auto walk_result = parent_op->walk([](Operation *op) {
+    // Cleanup code in attached regions.
+    for (Region &region : op->getRegions()) {
+      if (!llvm::hasSingleElement(region)) return WalkResult::interrupt();
+      RemoveIdentity(region.front());
+      RemoveDeadLocalVariables(region.front());
+    }
+
+    LogicalResult result = success();
+
+    // While condition cannot write to resource variables.
+ auto check_while_cond = [&](TF::AssignVariableOp assign) { + op->emitOpError("found resource write in loop condition."); + return WalkResult::interrupt(); + }; + + if (auto if_op = dyn_cast(op)) { + result = CanonicalizeFunctionalIfCase( + op, {if_op.then_function(), if_op.else_function()}, if_op.input()); + } else if (auto case_op = dyn_cast(op)) { + SmallVector branches; + for (Attribute branch : case_op.branches()) { + auto sym = branch.cast(); + branches.push_back( + SymbolTable::lookupNearestSymbolFrom(op, sym)); + } + result = CanonicalizeFunctionalIfCase(case_op, branches, case_op.input()); + } else if (auto while_op = dyn_cast(op)) { + if (while_op.cond_function().walk(check_while_cond).wasInterrupted()) + return WalkResult::interrupt(); + result = CanonicalizeFunctionalWhile(while_op); + } else if (isa( + op)) { + result = CanonicalizeRegionIfCaseCluster(op); + } else if (auto while_region = dyn_cast(op)) { + if (while_region.cond().walk(check_while_cond).wasInterrupted()) + return WalkResult::interrupt(); + // For while region, the body input and output arg should match. + CanonicalizeWhileRegion(while_region); + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) return WalkResult::interrupt(); + result = CleanupAndCanonicalize(func); + } + return failed(result) ? WalkResult::interrupt() : WalkResult::advance(); + }); + + return failure(walk_result.wasInterrupted()); +} + +} // anonymous namespace + +namespace TF { + +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(FuncOp func) { + return CleanupAndCanonicalize(func); +} + +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(ModuleOp module) { + auto walk_result = module.walk([](tf_device::ClusterOp cluster) { + if (failed(CleanupAndCanonicalize(cluster))) return WalkResult::interrupt(); + return WalkResult::advance(); + }); + return failure(walk_result.wasInterrupted()); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h new file mode 100644 index 00000000000..626ef91bcf6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ + +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +// Performs IR cleanup and canonicalization in preparation for Resource Op +// Lifting pass. 
It does several things: +// - Eliminate identity nodes to remove (most) of resource aliasing +// - Canonicalize functional control flow. For functional control flow we +// expect that any resource output of these ops matches the corresponding +// input, and then forward that input to the output. Fails if this is not the +// case. If successful, the following invariants will hold true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. +// - Canonicalize region based control flow. Again, any resource outputs are +// expected to be resolved to be one of the captured resource inputs. Fails +// if this is not the case. If successful, the following invariants will hold +// true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. +namespace mlir { +namespace TF { +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(ModuleOp module); +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(FuncOp func); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 597fbe2c0b1..eef879ca257 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -40,6 +41,8 @@ limitations under the License. #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FoldInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -50,10 +53,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" -#include "tensorflow/core/framework/op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/types.pb.h" @@ -115,12 +115,12 @@ Optional> InferShapeForFunctionReturnType(FuncOp func) { // Returns if the shape inference pass supports an op outside the TF dialect. bool IsSupportedNonTFOp(Operation* op) { - return isa(op); + return isa(op); } // Returns whether a cast back would need to be inserted, e.g., whether the @@ -155,57 +155,6 @@ void UpdateTypeAndInsertIncompatibleUseCasts(Dialect* tf_dialect, Type new_type, result.setType(new_type); } -// Extracts a PartialTensorShape from the MLIR type. 
-Optional GetShapeFromMlirType(Type t) { - if (auto ranked_type = t.dyn_cast()) { - // Convert the MLIR shape indices (int64_t) to TensorFlow indices - // (int64). - ArrayRef shape = ranked_type.getShape(); - SmallVector tf_shape(shape.begin(), shape.end()); - return tensorflow::PartialTensorShape({tf_shape.data(), tf_shape.size()}); - } - return None; -} - -// Gets the subtype's shape and data type for `type`. Templated to support both -// ResourceType and VariantType. -template -std::unique_ptr>> -GetSubtypesHelper(Type type) { - auto type_with_subtypes = - type.cast().getElementType().dyn_cast(); - if (!type_with_subtypes || type_with_subtypes.getSubtypes().empty()) { - return nullptr; - } - auto shapes_and_types = absl::make_unique>>(); - for (auto subtype : type_with_subtypes.getSubtypes()) { - auto shape = GetShapeFromMlirType(subtype); - // handle_shapes_and_types requires all shapes to be known. So if any - // subtype is unknown, clear the vector. - if (!shape) { - shapes_and_types = nullptr; - break; - } - tensorflow::DataType dtype; - auto status = - tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); - assert(status.ok() && "Unknown element type"); - shapes_and_types->emplace_back(*shape, dtype); - } - return shapes_and_types; -} - -// Gets the subtype's shape and data type for `type`. -std::unique_ptr>> -GetSubtypes(Type type) { - auto subclasses = GetSubtypesHelper(type); - if (subclasses) return subclasses; - return GetSubtypesHelper(type); -} - // Returns whether type can be further refined. bool CanBeRefined(Type type) { auto shape_type = type.dyn_cast(); @@ -292,8 +241,8 @@ bool InferShapeForCast(CastOp op, Dialect* tf_dialect) { // function result types. bool InferShapeForIf(IfOp op) { bool changed = false; - auto then_results = op.then_func().getType().getResults(); - auto else_results = op.else_func().getType().getResults(); + auto then_results = op.then_function().getType().getResults(); + auto else_results = op.else_function().getType().getResults(); for (auto it : llvm::zip(op.getResults(), then_results, else_results)) { // If then and else types do not match, skip refinement for that result. if (std::get<1>(it) != std::get<2>(it)) continue; @@ -596,7 +545,7 @@ ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context, bool propagate_caller_callee_constants) : graph_version_(graph_version), propagate_caller_callee_constants_(propagate_caller_callee_constants) { - tf_dialect_ = context->getRegisteredDialect(); + tf_dialect_ = context->getLoadedDialect(); } ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, @@ -697,11 +646,8 @@ bool ShapeInference::RefineShapeForPassThroughOps(Operation* op) { // TODO(jpienaar): The tf.Cast op, which is uniformly inserted at the // moment, cannot handle arbirary types (e.g., it can't handle quantized // types). This restriction can be relaxed if not only tf.Cast is used. 
- auto kind = t.getKind(); - return (kind >= Type::FIRST_STANDARD_TYPE && - kind < Type::LAST_STANDARD_TYPE) || - (kind >= Type::FIRST_TENSORFLOW_TYPE && - kind < Type::LAST_TENSORFLOW_TYPE); + return t.getDialect().getNamespace().empty() || + isa(t.getDialect()); }; bool changed = false; @@ -747,6 +693,11 @@ bool ShapeInference::InferShapeForNonTFDialectOperation(Operation* op) { return RefineTypeForPassThroughOperands(op, terminator->getOperands(), op->getResults()); } + if (auto cluster_op = dyn_cast(op)) { + auto terminator = cluster_op.GetBody().getTerminator(); + return RefineTypeForPassThroughOperands(op, terminator->getOperands(), + op->getResults()); + } if (op->hasTrait()) { return RefineShapeForPassThroughOps(op); } @@ -796,182 +747,54 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { if (auto if_region = dyn_cast(op)) return InferShapeForIfRegion(if_region); - StringRef op_name = op->getName().getStringRef(); - // Drop the `tf.` prefix to query TF registry. - auto node_name = - op_name.drop_front(TensorFlowDialect::getDialectNamespace().size() + 1); - - // Get information from the registry and check if we have a shape function for - // this op. - const tensorflow::OpRegistrationData* op_reg_data = - tensorflow::OpRegistry::Global()->LookUp(node_name.data()); - if (!op_reg_data) { - LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" - << op->getName() << "'.\n"); - return false; - } - if (op_reg_data->shape_inference_fn == nullptr) { - LLVM_DEBUG(llvm::dbgs() - << "Skipping inference for op without shape function '" - << op->getName() << "'.\n"); - return false; - } - - // Convert the operation to a NodeDef to be able to use the InferenceContext - // and the TensorFlow shape function. - auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( - op, node_name, /*ignore_unregistered_attrs=*/true); - if (!node_def_or.ok()) { - LLVM_DEBUG(llvm::dbgs() - << "Error converting op '" << *op << "' to NodeDef: " - << node_def_or.status().error_message() << "\n"); - return false; - } - std::unique_ptr node_def = - std::move(node_def_or).ValueOrDie(); - - // Collect an array with input values for constant operands and input shapes - // for all the operands. - std::vector input_tensors(op->getNumOperands()); - std::vector input_shapes( - op->getNumOperands()); - std::vector tensors(op->getNumOperands()); - std::vector>>> - handle_shapes_and_types(op->getNumOperands()); - for (auto it : llvm::enumerate(op->getOperands())) { - Value operand = it.value(); - size_t index = it.index(); - - // If the operand is constant, then convert it to Tensor. + // Return operand as a constant attribute. + auto operand_as_constant_fn = [&](Value operand) { ValuePort vp(operand); Attribute attr = ComputeOutputComponent(vp); if (!attr && matchPattern(operand, m_Constant(&attr))) RecordValue(vp, attr); - if (attr) { - tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = - tensorflow::ConvertToTensor(attr.cast(), input_tensor); - if (status.ok()) { - input_tensors[index] = input_tensor; - } else { - LLVM_DEBUG(llvm::dbgs() - << "Error converting input " << index << " of op '" << *op - << "' to Tensor: " << status.error_message() << "\n"); - } - } + return attr; + }; - Type operand_type = operand.getType(); - if (auto shape = GetShapeFromMlirType(operand_type)) { - input_shapes[index] = *shape; - } - // Collect the handle shapes and types for a resource/variant. 
- handle_shapes_and_types[index] = GetSubtypes(operand_type); - } + // Return op result as a shape. + auto op_result_as_shape_fn = [&](InferenceContext& context, + OpResult op_result) { + return ComputeOutputAsShape(op_result, &context); + }; - // Perform the shape inference using an InferenceContext with the input - // shapes. This object is abstracting the information that the ShapeInference - // function operates on. - InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, - input_shapes, input_tensors, - /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); - auto status = c.Run(op_reg_data->shape_inference_fn); - if (!status.ok()) { - LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op - << "': " << status.error_message() << "\n"); + // Return result element type at `index`. + auto result_element_type_fn = [&](int index) { + return op->getResult(index).getType().cast().getElementType(); + }; + + llvm::SmallVector inferred_return_shapes; + if (failed(InferReturnTypeComponentsForTFOp( + /*location=*/None, op, graph_version_, operand_as_constant_fn, + op_result_as_shape_fn, result_element_type_fn, + inferred_return_shapes))) return false; - } - - // Determine if, during shape computation, the shape functions attempted to - // query an input operand as shape where the input was not known/constant. - bool requires_inputs = - any_of(llvm::seq(0, c.num_inputs()), [&](int input) { - return c.requested_input_tensor_as_partial_shape(input) && - !input_tensors[input]; - }); - if (requires_inputs) { - LLVM_DEBUG(llvm::dbgs() << "\trequired input\n"); - std::vector input_tensors_as_shapes; - for (int input : llvm::seq(0, c.num_inputs())) { - if (c.requested_input_tensor_as_partial_shape(input) && - !input_tensors[input]) { - LLVM_DEBUG(llvm::dbgs() << "Requesting " << input << " as shape\n"); - auto op_result = op->getOperand(input).dyn_cast(); - if (!op_result) continue; - // Resize on first valid shape computed. - input_tensors_as_shapes.resize(c.num_inputs()); - auto handle = ComputeOutputAsShape(op_result, &c); - LLVM_DEBUG(llvm::dbgs() << "Requested " << input << " as shape " - << (handle.Handle() ? "found" : "not found")); - if (handle.Handle()) input_tensors_as_shapes[input] = handle; - } - } - - // Attempt to compute the unknown operands as shapes. - // Note: in the case where no partial outputs could be computed, this would - // be empty. - if (!input_tensors_as_shapes.empty()) { - c.set_input_tensors_as_shapes(input_tensors_as_shapes); - auto status = c.Run(op_reg_data->shape_inference_fn); - if (!status.ok()) { - LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op - << "': " << status.error_message() << "\n"); - return false; - } - } - } - - assert(c.num_outputs() == op->getNumResults() && - "inference context matches the MLIR number of results."); // Update the shape for each of the operation result if the InferenceContext // has more precise shapes recorded. bool changed = false; - for (int output : llvm::seq(0, c.num_outputs())) { - // Skip already statically shaped results. 
- Value result = op->getResult(output); - if (!CanBeRefined(result.getType())) continue; - auto shaped_type = result.getType().cast(); + for (auto result : llvm::zip(op->getResults(), inferred_return_shapes)) { + Value op_result = std::get<0>(result); + if (!CanBeRefined(op_result.getType())) continue; - ShapeHandle shape_handle = c.output(output); - LLVM_DEBUG(llvm::dbgs() << "Inferred output " << output << " : " - << c.DebugString(shape_handle) << "\n"); - auto get_tensor_type = [&c](const ShapeHandle& sh, - Type element_type) -> TensorType { - if (!c.RankKnown(sh)) return UnrankedTensorType::get(element_type); - // Convert the shape from TensorFlow (int64) to MLIR (int64_t). - SmallVector shape; - for (int dim : llvm::seq(0, c.Rank(sh))) - shape.push_back(c.Value(c.Dim(sh, dim))); - return RankedTensorType::get(shape, element_type); - }; - auto new_element_type = shaped_type.getElementType(); - // Populate the handle shapes for a resource/variant. - if (new_element_type.isa()) { - auto handle_shapes_types = c.output_handle_shapes_and_types(output); - if (handle_shapes_types) { - SmallVector subtypes; - OpBuilder b(op); - for (const auto& shape_n_type : *handle_shapes_types) { - Type element_type; - auto status = - tensorflow::ConvertDataType(shape_n_type.dtype, b, &element_type); - assert(status.ok() && "Unknown element type"); - subtypes.push_back(get_tensor_type(shape_n_type.shape, element_type)); - } - if (new_element_type.isa()) { - new_element_type = TF::ResourceType::get(subtypes, op->getContext()); - } else { - new_element_type = TF::VariantType::get(subtypes, op->getContext()); - } - } - } - auto new_type = get_tensor_type(shape_handle, new_element_type); - if (result.getType() == new_type) continue; + ShapedTypeComponents inferred = std::get<1>(result); + TensorType inferred_type; + if (inferred.hasRank()) + inferred_type = + RankedTensorType::get(inferred.getDims(), inferred.getElementType()); + else + inferred_type = UnrankedTensorType::get(inferred.getElementType()); - UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, new_type, op, result); + if (op_result.getType() == inferred_type) continue; + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, inferred_type, op, + op_result); changed = true; } + if (changed) LLVM_DEBUG(llvm::dbgs() << "Modified after shape inference: '" << *op << "'\n"); @@ -1101,7 +924,7 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( module, drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_func(), if_op.else_func()}, max_iteration); + {if_op.then_function(), if_op.else_function()}, max_iteration); } else if (auto case_op = dyn_cast(op)) { SmallVector branches; for (Attribute branch : case_op.branches()) { @@ -1114,7 +937,7 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions( module, while_op.getOperandTypes(), - {while_op.cond_func(), while_op.body_func()}, max_iteration); + {while_op.cond_function(), while_op.body_function()}, max_iteration); } else if (auto call_op = dyn_cast(op)) { if (auto func = dyn_cast(call_op.resolveCallable())) { PropagateConstantToCallee(call_op, func, module); @@ -1174,10 +997,11 @@ LogicalResult ShapeInference::TryToFold(Operation* op) { if (!dialect) return failure(); // Only attempt TF dialect fallback if there are no unknown operands. 
if (some_unknown && dialect == tf_dialect_) return failure(); - SmallVector constants; - if (failed(dialect->constantFoldHook(op, constant_operands, constants))) + auto* interface = dialect->getRegisteredInterface(); + if (!interface) return failure(); + + if (failed(interface->fold(op, constant_operands, fold_results))) return failure(); - fold_results.assign(constants.begin(), constants.end()); } for (auto result : zip(op->getResults(), fold_results)) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index d3755a4a7d0..a0651c6d013 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -163,7 +163,7 @@ LogicalResult HandleWhileOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = while_op.body_func(); + auto body = while_op.body_function(); llvm::SmallDenseMap body_map; auto find_arg_stack_type = [&](int64_t index) -> llvm::Optional { auto it = data_var_to_size_var.find(while_op.getOperand(index)); @@ -187,7 +187,7 @@ LogicalResult HandleWhileOp( return failure(); } // Cond should not change stacks in the arguments, so use an empty map. - auto cond = while_op.cond_func(); + auto cond = while_op.cond_function(); ModifyFunctionSignature(cond, nullptr, find_arg_stack_type); llvm::SmallDenseMap empty_map; if (failed(DecomposeStackOpsInternal(&cond.front(), module, &empty_map, @@ -231,8 +231,8 @@ LogicalResult HandleIfOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_func = if_op.then_func(); - auto else_func = if_op.else_func(); + auto then_func = if_op.then_function(); + auto else_func = if_op.else_function(); llvm::SmallDenseMap then_map; llvm::SmallDenseMap else_map; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index b3a05c06a67..01de6a89c83 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -443,12 +443,12 @@ llvm::SmallDenseMap> AccessedGradients( insert(grad.handle(), grad.source().str()); } else if (auto while_op = llvm::dyn_cast(&op)) { for (const auto& entry : AccessedGradients( - {while_op.body_func(), while_op.cond_func()}, module)) + {while_op.body_function(), while_op.cond_function()}, module)) for (const string& source : entry.getSecond()) insert(while_op.getOperand(entry.getFirst()), source); } else if (auto if_op = llvm::dyn_cast(&op)) { - for (const auto& entry : - AccessedGradients({if_op.then_func(), if_op.else_func()}, module)) + for (const auto& entry : AccessedGradients( + {if_op.then_function(), if_op.else_function()}, module)) for (const string& source : entry.getSecond()) insert(if_op.getOperand(entry.getFirst() + 1), source); } else if (auto call = llvm::dyn_cast(&op)) { @@ -509,8 +509,8 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = while_op.body_func(); - auto cond = while_op.cond_func(); + auto body = while_op.body_function(); + auto cond = while_op.cond_function(); auto grads = AccessedGradients({body, cond}, module); 
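The TryToFold change above swaps the dialect-wide constantFoldHook for a registered DialectFoldInterface lookup, so folding bails out early when a dialect never registered the interface. A rough sketch of that lookup-then-fold pattern, with a hypothetical FoldInterfaceSketch standing in for the MLIR interface:

#include <map>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-in for a per-dialect folding interface.
struct FoldInterfaceSketch {
  virtual ~FoldInterfaceSketch() = default;
  virtual bool Fold(const std::vector<int>& constant_operands,
                    std::vector<int>* fold_results) const = 0;
};

// Look the interface up first, return failure if the dialect never
// registered one, and only then attempt the fold.
bool TryFoldThroughInterface(
    const std::map<std::string, std::unique_ptr<FoldInterfaceSketch>>& registry,
    const std::string& dialect, const std::vector<int>& constant_operands,
    std::vector<int>* fold_results) {
  auto it = registry.find(dialect);
  if (it == registry.end() || !it->second) return false;  // no interface
  return it->second->Fold(constant_operands, fold_results);
}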
auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(while_op.getOperand(index)); @@ -592,8 +592,8 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); auto grads = AccessedGradients({then_branch, else_branch}, module); auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(if_op.getOperand(index + 1)); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc new file mode 100644 index 00000000000..f14efeb91ce --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/Passes.h" +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" + +namespace mlir { +namespace TF { +namespace { + +// Deletes the op and forwards the arguments. +template +class PassThroughConversion : public mlir::OpConversionPattern { + public: + explicit PassThroughConversion(MLIRContext *context) + : mlir::OpConversionPattern(context) {} + + LogicalResult matchAndRewrite( + TF_Op op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { // NOLINT + // Just forward the arguments to results. + rewriter.replaceOp(op, operands); + return success(); + } +}; + +class TensorDeviceCopyConversionPass + : public PassWrapper { + public: + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::ConversionTarget target(getContext()); + + // TODO(tfrt-devs): when device placer is introduced in the lowering pass, + // we need to check if Identity op and it's previous op are placed on the + // same device. If not, we don't fold Identity op since it's used for tensor + // copying between devices. 
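The new tensor_device_copy_conversion.cc pass above is essentially a pass-through rewrite: the pattern replaces an op with its own operands, and the TODO notes that this is only safe when producer and consumer sit on the same device, since a cross-device Identity stands for a real tensor copy. A small plain-C++ sketch of that idea over a hypothetical node graph (NodeSketch and the device check are assumptions, not the pass's actual API):

#include <string>
#include <vector>

struct NodeSketch {
  std::string op_name;            // e.g. "tf.Identity"
  std::string device;             // assigned device, possibly empty
  NodeSketch* input = nullptr;    // single data operand, for simplicity
  std::vector<NodeSketch*> users;
};

// Hypothetical guard reflecting the TODO: only forward the operand when no
// device change is involved.
bool CanForwardPassThrough(const NodeSketch& node) {
  if (node.op_name != "tf.Identity" || node.input == nullptr) return false;
  return node.device.empty() || node.device == node.input->device;
}

// Rewires every user of the Identity to its operand, the plain-graph
// equivalent of rewriter.replaceOp(op, operands).
void ForwardPassThrough(NodeSketch* node) {
  if (!CanForwardPassThrough(*node)) return;
  for (NodeSketch* user : node->users)
    if (user->input == node) user->input = node->input;
  node->users.clear();
}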
+ patterns.insert, + PassThroughConversion>(&getContext()); + + if (failed(applyPartialConversion(getFunction(), target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr> +CreateTensorDeviceCopyConversionPass() { + return std::make_unique(); +} + +static mlir::PassRegistration + tensor_device_copy_pass( + "tf-tensor-device-copy", + "Handle ops that copy tensors between devices. E.g., tf.Identity."); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 9634e4a8be3..da6757e6c94 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -155,7 +155,7 @@ LogicalResult HandleWhileOp( llvm::StringMap* decomposed_partitioned_call_callees) { // Rewrite body. - auto body = while_op.body_func(); + auto body = while_op.body_function(); llvm::SmallDenseMap body_map; auto find_arg_tensor_list_type = [&](int64_t index) -> llvm::Optional { auto it = buffer_to_size->find(while_op.getOperand(index)); @@ -176,7 +176,7 @@ LogicalResult HandleWhileOp( auto output_buffer_to_size = AddTensorListSizesToReturn(body, body_map); // Rewrite cond. - auto cond = while_op.cond_func(); + auto cond = while_op.cond_function(); llvm::SmallDenseMap cond_map; ModifyFunctionSignature(cond, cutil::GetSizeType(builder), &cond_map, find_arg_tensor_list_type, arg_buffer_size_is_fixed); @@ -701,9 +701,9 @@ LogicalResult DecomposeTensorListOpsInternal( return failure(); } } else if (auto if_op = llvm::dyn_cast(&op)) { - if (failed(HandleCaseOrIfOp(if_op, {if_op.then_func(), if_op.else_func()}, - module, buffer_to_size, - decomposed_partitioned_call_callees))) { + if (failed(HandleCaseOrIfOp( + if_op, {if_op.then_function(), if_op.else_function()}, module, + buffer_to_size, decomposed_partitioned_call_callees))) { return failure(); } } else if (auto case_op = llvm::dyn_cast(&op)) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc new file mode 100644 index 00000000000..920b2024c0f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/test_resource_alias_analysis.cc @@ -0,0 +1,111 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +// A pass that annotates each operation with a resource type result with the +// aliasing values for each such result. Each value is assigned a unique ID, and +// that ID is used to annotate the operations. +struct TestResourceAliasAnalysis + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TestResourceAliasAnalysis, TF::ResourceAliasAnalysis> { + void runOnFunction(FuncOp func, + const TF::ResourceAliasAnalysis::Info& analysis) { + int64_t next_id = 0; + llvm::SmallDenseMap ids; + + auto assign_id = [&](Value value) { + if (ids.find(value) == ids.end()) ids.insert({value, next_id++}); + }; + + auto get_id = [&](Value value) -> int64_t { + auto it = ids.find(value); + assert(it != ids.end()); + return it->second; + }; + + auto print_aliases = [&](InFlightDiagnostic& diag, Value value) { + diag << ", ID " << get_id(value) << " : "; + if (analysis.IsUnknownResource(value)) { + diag << "Unknown"; + } else { + auto aliases = llvm::to_vector<4>(analysis.GetResourceAliases(value)); + llvm::sort(aliases, + [&](Value v1, Value v2) { return get_id(v1) < get_id(v2); }); + llvm::interleaveComma(aliases, diag, + [&](Value v) { diag << get_id(v); }); + } + }; + + // Assign a unique ID to each value seen in this function. + func.walk([&](Operation* op) { + // For all attached regions, assign ID to the region arguments. + for (Region& region : op->getRegions()) { + for (auto region_arg : filter_resources(region.getArguments())) + assign_id(region_arg); + } + + // Assign ID for all results. + for (auto result : filter_resources(op->getResults())) assign_id(result); + }); + + // Now walk each operation, and annotate it wil remarks for aliases for + // each resource type result + func.walk([&](Operation* op) { + // For all attached regions, assign ID to the region arguments. + for (Region& region : op->getRegions()) { + for (auto region_arg : filter_resources(region.getArguments())) { + InFlightDiagnostic diag = op->emitRemark("Region #") + << region.getRegionNumber() << ", Arg #" + << region_arg.getArgNumber(); + print_aliases(diag, region_arg); + } + } + + for (auto result : filter_resources(op->getResults())) { + InFlightDiagnostic diag = op->emitRemark("Result #") + << result.getResultNumber(); + print_aliases(diag, result); + } + }); + } +}; + +static mlir::PassRegistration pass( + "tf-test-resource-alias-analysis", + "Add remarks based on resource alias analysis result, for testing " + "purpose."); + +} // anonymous namespace +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/test_visitor_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/test_visitor_util.cc new file mode 100644 index 00000000000..689becb796b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/test_visitor_util.cc @@ -0,0 +1,114 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
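The test pass above assigns each resource value a dense ID in first-seen order and prints each result's aliases sorted by ID so the remarks are deterministic. A compact sketch of that bookkeeping, with ValueKey as a hypothetical stand-in for an MLIR Value:

#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using ValueKey = const void*;

class IdAssigner {
 public:
  // First insertion wins, so IDs follow first-seen order like assign_id.
  void Assign(ValueKey v) {
    ids_.emplace(v, static_cast<int64_t>(ids_.size()));
  }
  int64_t Get(ValueKey v) const { return ids_.at(v); }

  // Sorted alias IDs, matching the sort-by-ID step before printing.
  std::vector<int64_t> SortedIds(const std::vector<ValueKey>& aliases) const {
    std::vector<int64_t> out;
    out.reserve(aliases.size());
    for (ValueKey v : aliases) out.push_back(Get(v));
    std::sort(out.begin(), out.end());
    return out;
  }

 private:
  std::map<ValueKey, int64_t> ids_;
};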
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h" + +namespace tensorflow { +namespace { + +std::string get_stage_description(const WalkStage &stage) { + if (stage.IsBeforeAllRegions()) return "before all regions"; + if (stage.IsAfterAllRegions()) return "after all regions"; + return "before region #" + std::to_string(stage.GetNextRegion()); +} + +// A pass that annotates each operation with an remarks that include a unique +// step ID and a description of the visitor step. +class TestVisitorUtil + : public mlir::PassWrapper { + public: + void runOnFunction() override { + mlir::FuncOp func = getOperation(); + int step_id = 0; + GenericWalk(func, [&](mlir::Operation *op, const WalkStage &stage) { + op->emitRemark() << step_id++ << ": " << get_stage_description(stage); + }); + + // Exercise static inference of operation type + GenericWalk(func, [&](mlir::TF::IfRegionOp op, const WalkStage &stage) { + op.emitRemark() << step_id++ << ": " << get_stage_description(stage); + }); + } +}; + +class TestVisitorUtilInterrupt + : public mlir::PassWrapper { + public: + void runOnFunction() override { + mlir::FuncOp func = getOperation(); + int step_id = 0; + + auto walker = [&](mlir::Operation *op, const WalkStage &stage) { + if (auto interrupt_before_all = + op->getAttrOfType("interrupt_before_all")) + if (interrupt_before_all.getValue() && stage.IsBeforeAllRegions()) + return mlir::WalkResult::interrupt(); + + if (auto interrupt_after_all = + op->getAttrOfType("interrupt_after_all")) + if (interrupt_after_all.getValue() && stage.IsAfterAllRegions()) + return mlir::WalkResult::interrupt(); + + if (auto interrupt_after_region = + op->getAttrOfType("interrupt_after_region")) + if (stage.IsAfterRegion( + static_cast(interrupt_after_region.getInt()))) + return mlir::WalkResult::interrupt(); + + op->emitRemark() << step_id++ << ": " << get_stage_description(stage); + return mlir::WalkResult::advance(); + }; + + // Interrupt the walk based on attributes on the operation. + auto result = GenericWalk(func, walker); + + if (result.wasInterrupted()) + func.emitRemark() << step_id++ << ": walk was interrupted"; + + // Exercise static inference of operation type for interrupting callback. 
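The visitor-util test passes above exercise a staged walk: the callback fires before any region of an op, again between regions, and again after all regions, and may interrupt the traversal. A simplified sketch of that stage/interrupt protocol over a toy op tree (OpSketch, StageSketch, and StagedWalk are assumptions sketching the idea, not the utility's real API):

#include <functional>
#include <string>
#include <vector>

struct OpSketch {
  std::string name;
  std::vector<std::vector<OpSketch>> regions;  // each region is a list of ops
};

enum class Walk { kAdvance, kInterrupt };

struct StageSketch {
  int next_region = 0;  // region about to be visited
  int num_regions = 0;
  bool BeforeAll() const { return next_region == 0; }
  bool AfterAll() const { return next_region == num_regions; }
};

Walk StagedWalk(OpSketch& op,
                const std::function<Walk(OpSketch&, const StageSketch&)>& fn) {
  StageSketch stage{0, static_cast<int>(op.regions.size())};
  // Callback before any region; the callee may interrupt the whole walk.
  if (fn(op, stage) == Walk::kInterrupt) return Walk::kInterrupt;
  for (auto& region : op.regions) {
    for (OpSketch& nested : region)
      if (StagedWalk(nested, fn) == Walk::kInterrupt) return Walk::kInterrupt;
    ++stage.next_region;  // callback between regions, and after the last one
    if (fn(op, stage) == Walk::kInterrupt) return Walk::kInterrupt;
  }
  return Walk::kAdvance;
}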
+ result = + GenericWalk(func, [&](mlir::TF::IfRegionOp op, const WalkStage &stage) { + return walker(op, stage); + }); + + if (result.wasInterrupted()) + func.emitRemark() << step_id++ << ": walk was interrupted"; + } +}; + +mlir::PassRegistration pass( + "tf-test-visitor-util", + "Add remarks that trace order of visiting operations using TF visitor " + "utilities."); + +mlir::PassRegistration pass_interrupt( + "tf-test-visitor-util-interrupt", + "Add remarks that trace order of visiting operations using TF visitor " + "utilities, interrupt version."); + +} // anonymous namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc index 2a770b2615d..f26887eb276 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc @@ -34,7 +34,7 @@ class SimpleTFDeviceAssignmentPass void runOnFunction() override { Builder builder(&getContext()); - Dialect* tf = getContext().getRegisteredDialect(); + Dialect* tf = getContext().getLoadedDialect(); getFunction().walk([&](Operation* op) { if (auto device_attr = op->getAttrOfType("device")) { // We assign default device to ops with device attribute that is empty. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc index 1e4caaf5dd6..52ac87ecf71 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/Identifier.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" @@ -43,6 +44,10 @@ namespace tensorflow { class GraphOptPass : public mlir::PassWrapper> { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + mlir::RegisterAllTensorFlowDialects(registry); + } + public: explicit GraphOptPass(std::vector passes) : passes_(std::move(passes)) {} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc new file mode 100644 index 00000000000..93098acdc9d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +// This pass eliminate `_tpu_replicate` and `device` attribute on operations +// that are contained in a tf_device.cluster op. + +namespace mlir { +namespace TFTPU { + +namespace { + +constexpr char kTPUReplicateAttr[] = "_tpu_replicate"; +constexpr char kDeviceAttr[] = "device"; + +class TPUCleanupClusterAttributesPass + : public PassWrapper> { + public: + void runOnOperation() override { + getOperation().walk([](tf_device::ClusterOp cluster) { + cluster.walk([](Operation *op) { + if (isa(op)) return; + for (StringRef attr : {kTPUReplicateAttr, kDeviceAttr}) + op->removeAttr(attr); + }); + }); + } +}; + +PassRegistration pass( + "tf-tpu-cleanup-cluster-attributes", + "Eliminate _tpu_replicate and other attributes from ops in a cluster"); + +} // namespace + +std::unique_ptr> +CreateTPUClusterCleanupAttributesPass() { + return std::make_unique(); +} + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index 162ecd77d4f..c3f40154c79 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -48,6 +48,7 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -70,55 +71,62 @@ constexpr char kBadTPUReplicateAttrMsg[] = using MetadataMap = llvm::SmallDenseMap; +// A set of operations in a cluster. +using ClusterOps = llvm::SmallSetVector; + // Mapping for `_tpu_replicate` attribute to ops of a cluster. -using ClusterMap = llvm::SmallDenseMap, 8>; +using ClusterMap = llvm::SmallDenseMap; struct TPUClusterFormation - : public PassWrapper { - void runOnFunction() override; + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TPUClusterFormation, TF::ResourceAliasAnalysis> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnFunction( + FuncOp func, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis); }; // Creates a mapping from the TPUReplicateMetadata ops `_tpu_replicate` // attribute to its attributes and removes the ops. If multiple // TPUReplicateMetadata ops have the same `_tpu_replicate` attribute, an error // will be returned. 
-LogicalResult CollectMetadata(Operation* op, MetadataMap* metadata_map) { - auto result = - op->walk([&](TF::TPUReplicateMetadataOp metadata_op) -> WalkResult { - MutableDictionaryAttr attrs = metadata_op.getAttrs(); +LogicalResult CollectMetadata(Block* block, MetadataMap* metadata_map) { + // Just look at top-level operations in the block (not nested ones) + for (Operation& op : llvm::make_early_inc_range(*block)) { + auto metadata_op = dyn_cast(op); + if (!metadata_op) continue; - // Missing or bad `_tpu_replicate` attribute. - auto tpu_replicate_attr = attrs.get(kTPUReplicateAttr); - if (!tpu_replicate_attr) - return metadata_op.emitError() << kBadTPUReplicateAttrMsg; + MutableDictionaryAttr attrs = metadata_op.getAttrs(); - auto tpu_replicate_attr_str = tpu_replicate_attr.dyn_cast(); - if (!tpu_replicate_attr_str || - tpu_replicate_attr_str.getValue().empty()) - return metadata_op.emitError() << kBadTPUReplicateAttrMsg; + // Missing or bad `_tpu_replicate` attribute. + auto tpu_replicate_attr = attrs.get(kTPUReplicateAttr); + if (!tpu_replicate_attr) + return metadata_op.emitError() << kBadTPUReplicateAttrMsg; - // Remove `name` attribute. - attrs.remove(Identifier::get(kNameAttr, metadata_op.getContext())); + auto tpu_replicate_attr_str = tpu_replicate_attr.dyn_cast(); + if (!tpu_replicate_attr_str || tpu_replicate_attr_str.getValue().empty()) + return metadata_op.emitError() << kBadTPUReplicateAttrMsg; - auto it = metadata_map->try_emplace(tpu_replicate_attr_str.getValue(), - std::move(attrs)); + // Remove `name` attribute. + attrs.remove(Identifier::get(kNameAttr, metadata_op.getContext())); - // There are multiple TPUReplicateMetadata ops with the same - // `_tpu_replicate` attribute. - if (!it.second) { - return metadata_op.emitError() - << "multiple TPUReplicateMetadata ops with the same '" - << kTPUReplicateAttr << "' attribute '" - << tpu_replicate_attr_str.getValue() << "' found"; - } + auto it = metadata_map->try_emplace(tpu_replicate_attr_str.getValue(), + std::move(attrs)); - metadata_op.erase(); - return WalkResult::advance(); - }); - - // Return failure if the walk was interrupted. - return failure(result.wasInterrupted()); + // There are multiple TPUReplicateMetadata ops with the same + // `_tpu_replicate` attribute. + if (!it.second) { + return metadata_op.emitError() + << "multiple TPUReplicateMetadata ops with the same '" + << kTPUReplicateAttr << "' attribute '" + << tpu_replicate_attr_str.getValue() << "' found"; + } + metadata_op.erase(); + } + return success(); } // Collects and clusters ops with the same `_tpu_replicate` attribute. This will @@ -138,14 +146,34 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { return success(); } +// Collects all resource ids from an op. 
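The rewritten CollectMetadata above iterates only the top-level operations of the block and erases each metadata op as it goes; llvm::make_early_inc_range makes that safe by advancing the iterator before the body runs. The plain-C++ equivalent over a std::list, with hypothetical TopLevelOp and attribute names:

#include <list>
#include <map>
#include <string>

struct TopLevelOp {
  std::string kind;                          // e.g. "TPUReplicateMetadata"
  std::map<std::string, std::string> attrs;  // flattened attribute view
};

bool CollectMetadataSketch(std::list<TopLevelOp>& block,
                           std::map<std::string, TopLevelOp>* metadata_map) {
  for (auto it = block.begin(); it != block.end();) {
    auto current = it++;  // advance first, so erasing *current* stays safe
    if (current->kind != "TPUReplicateMetadata") continue;
    auto replicate = current->attrs.find("_tpu_replicate");
    if (replicate == current->attrs.end() || replicate->second.empty())
      return false;  // missing or bad `_tpu_replicate` attribute
    if (!metadata_map->emplace(replicate->second, *current).second)
      return false;  // duplicate metadata op for the same cluster
    block.erase(current);
  }
  return true;
}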
+void CollectResourceIdsFromOp( + Operation& op, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis, + llvm::SmallDenseSet& observed_resource_ids) { + op.walk([&](Operation* inner_op) { + for (Value operand : TF::filter_resources(inner_op->getOperands())) { + if (resource_alias_analysis.IsUnknownResource(operand)) continue; + const auto& ids = resource_alias_analysis.GetResourceUniqueIds(operand); + observed_resource_ids.insert(ids.begin(), ids.end()); + } + for (Value result : TF::filter_resources(inner_op->getResults())) { + if (resource_alias_analysis.IsUnknownResource(result)) continue; + const auto& ids = resource_alias_analysis.GetResourceUniqueIds(result); + observed_resource_ids.insert(ids.begin(), ids.end()); + } + }); +} + // Checks if an op should be moved after a cluster. There may be users of a // cluster interleaved among the cluster ops. bool ShouldMoveOpAfterCluster( - Block* block, Operation* op, - const llvm::SmallSetVector& cluster_ops, - const llvm::SmallSetVector& preceding_users) { - auto result = op->walk([&](Operation* op) { - for (Value operand : op->getOperands()) { + Block* block, Operation* op, const ClusterOps& cluster_ops, + const llvm::SmallSetVector& preceding_users, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis, + const llvm::SmallDenseSet& observed_resource_ids) { + auto result = op->walk([&](Operation* inner_op) { + for (Value operand : inner_op->getOperands()) { Operation* def = operand.getDefiningOp(); // Operands may not have a defining op (BlockArgument) or is from a // different block. @@ -157,6 +185,14 @@ bool ShouldMoveOpAfterCluster( return WalkResult::interrupt(); } } + + // Check for uses of any resource in or after cluster. + for (Value operand : TF::filter_resources(inner_op->getOperands())) { + if (resource_alias_analysis.IsUnknownResource(operand)) continue; + auto ids = resource_alias_analysis.GetResourceUniqueIds(operand); + for (const auto& id : ids) + if (observed_resource_ids.contains(id)) return WalkResult::interrupt(); + } return WalkResult::advance(); }); @@ -165,16 +201,31 @@ bool ShouldMoveOpAfterCluster( // Collects ops that are before ops in the cluster but are users of other ops // in the cluster. This may happen because users of individual ops in the -// cluster may be interleaved with other ops in the cluster. +// cluster may be interleaved with other ops in the cluster. Resource id's are +// also captured, to keep track of resource usage before, in, or after the +// cluster. +// TODO(lyandy): Extend this to handle all side effecting ops while handling +// transitive data dependencies. 
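The resource bookkeeping above boils down to two steps: ops already assigned to the cluster contribute their resource IDs to an observed set, and an interleaved op must be moved after the cluster as soon as it touches any observed ID. A minimal sketch of that check, with integer IDs standing in for ResourceAliasAnalysis unique IDs:

#include <cstdint>
#include <set>
#include <vector>

using ResourceIds = std::vector<int64_t>;

// Adds the IDs an op touches to the running "observed" set.
void ObserveResources(const ResourceIds& ids,
                      std::set<int64_t>* observed_resource_ids) {
  observed_resource_ids->insert(ids.begin(), ids.end());
}

// An op conflicts with the cluster, and so must move after it, if any of its
// resource IDs was already observed inside the cluster.
bool MustMoveAfterCluster(const ResourceIds& op_resource_ids,
                          const std::set<int64_t>& observed_resource_ids) {
  for (int64_t id : op_resource_ids)
    if (observed_resource_ids.count(id)) return true;
  return false;
}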
llvm::SmallSetVector CollectClusterPrecedingUsers( - Block* block, const llvm::SmallSetVector& cluster_ops) { + Block* block, const ClusterOps& cluster_ops, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { llvm::SmallSetVector preceding_users; + llvm::SmallDenseSet observed_resource_ids; - for (Operation& op : llvm::make_range(Block::iterator(cluster_ops.front()), - Block::iterator(cluster_ops.back()))) - if (cluster_ops.count(&op) == 0 && - ShouldMoveOpAfterCluster(block, &op, cluster_ops, preceding_users)) + auto front = Block::iterator(cluster_ops.front()); + auto back = Block::iterator(cluster_ops.back()); + for (Operation& op : llvm::make_range(front, back)) { + if (cluster_ops.contains(&op)) { + CollectResourceIdsFromOp(op, resource_alias_analysis, + observed_resource_ids); + } else if (ShouldMoveOpAfterCluster( + block, &op, cluster_ops, preceding_users, + resource_alias_analysis, observed_resource_ids)) { preceding_users.insert(&op); + CollectResourceIdsFromOp(op, resource_alias_analysis, + observed_resource_ids); + } + } return preceding_users; } @@ -185,7 +236,7 @@ llvm::SmallSetVector CollectClusterPrecedingUsers( // outside of the cluster (i.e. results of ops in the cluster are only consumed // by other ops in the cluster) are pruned. llvm::SmallVector CollectClusterResults( - Block* block, const llvm::SmallSetVector& cluster_ops) { + Block* block, const ClusterOps& cluster_ops) { llvm::SmallVector results; for (Operation* op : cluster_ops) { @@ -204,61 +255,52 @@ llvm::SmallVector CollectClusterResults( } // Creates a `tf_device.cluster` to wrap cluster ops. -tf_device::ClusterOp CreateOpForCluster(Operation* last_cluster_op, - llvm::ArrayRef results) { +tf_device::ClusterOp CreateClusterOp( + Block* block, const ClusterOps& cluster_ops, llvm::ArrayRef results, + llvm::ArrayRef preceding_users) { // `tf_device.cluster` will be placed at where the last op of the cluster is. + Operation* last_cluster_op = cluster_ops.back(); OpBuilder builder(last_cluster_op); llvm::SmallVector result_types; for (Value result : results) result_types.push_back(result.getType()); - auto cluster = builder.create(last_cluster_op->getLoc(), result_types); - cluster.body().push_back(new Block); + Block* body = new Block; + cluster.body().push_back(body); + + // Move cluster ops to the cluster body. Also remove `_tpu_replicate` and + // `device` attribute from ops in the cluster as that information will be + // present in the `tf_device.cluster`. Do this for all ops including nested + // ops. + for (Operation* cluster_op : cluster_ops) { + cluster_op->moveBefore(body, body->end()); + cluster_op->walk([&](Operation* inner_op) { + inner_op->removeAttr(kTPUReplicateAttr); + inner_op->removeAttr(kDeviceAttr); + }); + } // Add terminator. - builder.setInsertionPointToEnd(&cluster.GetBody()); + builder.setInsertionPointToEnd(body); builder.create(last_cluster_op->getLoc(), results); - return cluster; -} - -// Moves cluster ops to associated `tf_device.cluster` body. -void MoveClusterOpsToCluster( - tf_device::ClusterOp cluster, - const llvm::SmallSetVector& cluster_ops) { - MLIRContext* context = cluster.getContext(); - Operation* terminator = cluster.GetBody().getTerminator(); - - for (Operation* cluster_op : cluster_ops) { - // Remove `_tpu_replicate` and `device` attribute from ops in the cluster - // as that information will be present in the `tf_device.cluster`. 
- cluster_op->removeAttr(Identifier::get(kTPUReplicateAttr, context)); - cluster_op->removeAttr(Identifier::get(kDeviceAttr, context)); - cluster_op->moveBefore(terminator); - } -} - -// Replaces uses of cluster ops results outside of cluster with the associated -// `tf_device.cluster` results. -void UpdateClusterResultExternalUses(tf_device::ClusterOp cluster, - llvm::ArrayRef results) { - Block& cluster_block = cluster.GetBody(); + // Replaces uses of cluster ops results outside of cluster with the associated + // `tf_device.cluster` results. for (auto ret_vals : llvm::zip(results, cluster.getResults())) { Value old_ret = std::get<0>(ret_vals); Value new_ret = std::get<1>(ret_vals); - for (auto& use : llvm::make_early_inc_range(old_ret.getUses())) - if (!cluster_block.findAncestorOpInBlock(*use.getOwner())) - use.set(new_ret); + for (auto& use : llvm::make_early_inc_range(old_ret.getUses())) { + Operation* user = use.getOwner(); + if (!body->findAncestorOpInBlock(*user)) use.set(new_ret); + } } -} -// Moves users of cluster that are before the cluster to after the cluster. -void MovePrecedingClusterUsers(tf_device::ClusterOp cluster, - llvm::ArrayRef preceding_users) { + // Move users of cluster that are before the cluster to after the cluster. Operation* op_after_cluster = cluster.getOperation()->getNextNode(); for (Operation* user : preceding_users) user->moveBefore(op_after_cluster); + return cluster; } // Sorts `tf.TPUReplicatedInput` ops by `index` attribute. Ops with an `index` @@ -271,8 +313,7 @@ LogicalResult SortTPUReplicatedInputsByIndex( llvm::SmallVectorImpl* sorted_inputs) { llvm::SmallDenseSet unique_indices; for (Operation* input : inputs) { - int64_t index = - llvm::cast(input).index().getSExtValue(); + int64_t index = llvm::cast(input).index(); if (index < -1) return input->emitOpError() << "requires index to be at least -1, but got " << index; @@ -291,10 +332,8 @@ LogicalResult SortTPUReplicatedInputsByIndex( std::stable_sort( sorted_inputs->begin(), sorted_inputs->end(), [](Operation* l, Operation* r) { - int64_t l_index = - llvm::cast(l).index().getSExtValue(); - int64_t r_index = - llvm::cast(r).index().getSExtValue(); + int64_t l_index = llvm::cast(l).index(); + int64_t r_index = llvm::cast(r).index(); if (l_index == -1 && r_index != -1) return false; if (r_index == -1 && l_index != -1) return true; return l_index < r_index; @@ -350,8 +389,7 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { return input->emitOpError() << "requires " << num_inputs << " operands"; auto tpu_replicated_input = llvm::cast(input); - int64_t tpu_replicated_input_index = - tpu_replicated_input.index().getSExtValue(); + int64_t tpu_replicated_input_index = tpu_replicated_input.index(); if (is_packed) { packed_inputs.push_back(input->getOperand(0)); packed_input_indices.push_back(tpu_replicated_input_index); @@ -442,10 +480,30 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { // 8. Wrap cluster (`tf_device.cluster`) in a `tf_device.replicate` if // attribute `num_replicas` is greater than 1. // 9. Copy over TPUReplicateMetadata attributes to `tf_device.cluster`. 
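The sorting change above keeps the same ordering rule while reading the index attribute directly: non-negative indices sort ascending, index -1 sorts last, and the stable sort preserves the original order among equal keys. A self-contained sketch of that comparator:

#include <algorithm>
#include <cstdint>
#include <vector>

// Orders replicated-input indices the way SortTPUReplicatedInputsByIndex
// does: ascending, with -1 (unindexed inputs) pushed to the end, stably.
void SortReplicatedInputIndices(std::vector<int64_t>* indices) {
  std::stable_sort(indices->begin(), indices->end(),
                   [](int64_t l, int64_t r) {
                     if (l == -1 && r != -1) return false;  // -1 goes last
                     if (r == -1 && l != -1) return true;
                     return l < r;
                   });
}

For example, {3, -1, 0, 2} sorts to {0, 2, 3, -1}; the uniqueness check for non-negative indices happens separately, before the sort.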
-LogicalResult FormClustersInBlock(Block* block, - const MetadataMap& metadata_map) { +LogicalResult FormClustersInBlock( + Block* block, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { + MetadataMap metadata_map; + LogicalResult result = CollectMetadata(block, &metadata_map); + if (failed(result)) return result; + + // If there is no TPUReplicateMetadata op in this block, process blocks in + // regions attached to the op's in the block. + if (metadata_map.empty()) { + for (Operation& op : *block) { + for (Region& region : op.getRegions()) { + if (!llvm::hasSingleElement(region)) + return op.emitOpError("Expected single block region"); + if (failed( + FormClustersInBlock(®ion.front(), resource_alias_analysis))) + return failure(); + } + } + return success(); + } + ClusterMap clusters; - LogicalResult result = CollectAndGroupClusterOps(block, &clusters); + result = CollectAndGroupClusterOps(block, &clusters); if (failed(result)) return result; for (const auto& cluster_metadata_and_ops : clusters) { @@ -464,19 +522,14 @@ LogicalResult FormClustersInBlock(Block* block, } llvm::SmallSetVector preceding_users = - CollectClusterPrecedingUsers(block, cluster_ops); + CollectClusterPrecedingUsers(block, cluster_ops, + resource_alias_analysis); llvm::SmallVector results = CollectClusterResults(block, cluster_ops); - tf_device::ClusterOp cluster = - CreateOpForCluster(cluster_ops.back(), results); - - MoveClusterOpsToCluster(cluster, cluster_ops); - - UpdateClusterResultExternalUses(cluster, results); - - MovePrecedingClusterUsers(cluster, preceding_users.getArrayRef()); + tf_device::ClusterOp cluster = CreateClusterOp( + block, cluster_ops, results, preceding_users.getArrayRef()); auto num_replicas = cluster_metadata->getSecond().get(kNumReplicasAttr); if (!num_replicas || !num_replicas.isa()) @@ -496,17 +549,19 @@ LogicalResult FormClustersInBlock(Block* block, return success(); } -void TPUClusterFormation::runOnFunction() { - MetadataMap metadata_map; - if (failed(CollectMetadata(getFunction(), &metadata_map))) +void TPUClusterFormation::runOnFunction( + FuncOp func, + const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { + if (!llvm::hasSingleElement(func)) { + func.emitOpError("Expecting a single block function"); + return signalPassFailure(); + } + + if (failed(FormClustersInBlock(&func.front(), resource_alias_analysis))) return signalPassFailure(); - for (Block& block : getFunction()) - if (failed(FormClustersInBlock(&block, metadata_map))) - return signalPassFailure(); - // Remove TPUReplicatedInput and TPUReplicatedOutput nodes. - auto remove_result = getFunction().walk([&](Operation* op) { + auto remove_result = func.walk([&](Operation* op) { if (!llvm::isa(op)) return WalkResult::advance(); @@ -533,7 +588,7 @@ void TPUClusterFormation::runOnFunction() { } } // anonymous namespace -std::unique_ptr> CreateTPUClusterFormationPass() { +std::unique_ptr> CreateTPUClusterFormationPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc new file mode 100644 index 00000000000..b4889f6e52c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc @@ -0,0 +1,137 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" + +namespace mlir { +namespace TFTPU { +namespace { + +// Pass that co-locates resource ops that use composite device resources +// (packed tensors) with the underlying physical TPU device. +struct TPUColocateCompositeResourceOps + : public PassWrapper { + void runOnFunction() override; +}; + +// Wraps single op in `tf_device.launch` for explicit device assignment. +void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op, + llvm::StringRef device) { + builder->setInsertionPoint(op); + auto launch = builder->create( + loc, builder->getStringAttr(device), op->getResultTypes()); + launch.body().push_back(new Block); + op->replaceAllUsesWith(launch); + + builder->setInsertionPointToEnd(&launch.GetBody()); + builder->create(loc, op->getResults()); + + // Move op inside cluster. + op->moveBefore(launch.GetBody().getTerminator()); +} + +llvm::SmallVector GetResourceOpsUsingCompositeArgsInReplicate( + tf_device::ReplicateOp replicate) { + llvm::SmallVector resource_users; + const auto add_resource_op_to_list = [&resource_users](Operation* op) { + if (!llvm::isa(op)) return; + + resource_users.emplace_back(op); + }; + + llvm::SmallVector resource_users_to_visit; + for (auto composite_arguments : replicate.GetPackedBlockArguments()) { + for (auto resource_user : composite_arguments.getUsers()) + resource_users_to_visit.emplace_back(resource_user); + } + + while (!resource_users_to_visit.empty()) { + llvm::SmallVector new_resource_users; + + for (auto resource_user : resource_users_to_visit) { + add_resource_op_to_list(resource_user); + + // Account for pass-through identity ops. 
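The traversal above is a worklist walk: it starts from the users of the packed (composite) block arguments, records resource ops it meets, and keeps following pass-through Identity ops so users hidden behind them are visited too. A simplified sketch over a hypothetical user graph (UserNode and the op-name checks are illustrative stand-ins):

#include <string>
#include <vector>

struct UserNode {
  std::string op_name;           // e.g. "tf.ReadVariableOp", "tf.Identity"
  std::vector<UserNode*> users;  // users of this op's result
};

std::vector<UserNode*> CollectCompositeResourceUsers(
    std::vector<UserNode*> worklist) {
  std::vector<UserNode*> resource_users;
  while (!worklist.empty()) {
    std::vector<UserNode*> next;
    for (UserNode* user : worklist) {
      if (user->op_name == "tf.ReadVariableOp" ||
          user->op_name == "tf.AssignVariableOp")
        resource_users.push_back(user);
      if (user->op_name == "tf.Identity")  // pass-through: keep following
        next.insert(next.end(), user->users.begin(), user->users.end());
    }
    worklist.swap(next);  // process the newly discovered users next round
  }
  return resource_users;
}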
+ if (auto pass_through_identity = + llvm::dyn_cast(resource_user)) { + for (auto identity_user : pass_through_identity.output().getUsers()) { + new_resource_users.emplace_back(identity_user); + } + } + } + resource_users_to_visit.swap(new_resource_users); + } + + return resource_users; +} + +void ColocateCompositeResourceOpsInReplicate( + tf_device::ReplicateOp replicate_op, OpBuilder* builder) { + auto devices = replicate_op.devices(); + if (!devices) return; + if (!devices.getValue().get(tensorflow::GetDeviceAliasForLogicalCore(0))) + return; + + const auto composite_resource_users = + GetResourceOpsUsingCompositeArgsInReplicate(replicate_op); + for (auto resource_user : composite_resource_users) { + WrapOpInLaunch(builder, resource_user->getLoc(), resource_user, + tensorflow::GetDeviceAliasForLogicalCore(0)); + } +} + +void TPUColocateCompositeResourceOps::runOnFunction() { + // Find all the executes first, since we will mutate the nodes around each + // execute in the same tf_device.replicate op. + llvm::SmallVector execute_launches; + getFunction().walk([&](tf_device::LaunchOp op) { + if (op.WrapsSingleOp() && + llvm::isa( + op.GetBody().front())) + execute_launches.push_back(op); + }); + + OpBuilder builder(&getContext()); + for (auto execute_launch : execute_launches) { + auto replicate = execute_launch.getParentOfType(); + if (!replicate) continue; + + ColocateCompositeResourceOpsInReplicate(replicate, &builder); + } +} + +} // namespace + +std::unique_ptr> CreateTPUColocateCompositeResourceOps() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-colocate-composite-resource-ops", + "Colocate resource with composite device assignment to TPU device."); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index 41362465cd9..59f36e03fbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -185,7 +185,7 @@ bool HandleReplicatedInputs( const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { // We need to know the devices to copy to. if (!replicate.devices()) return false; - int64_t num_replicas = replicate.n().getZExtValue(); + int64_t num_replicas = replicate.n(); auto inputs = replicate.getOperands() .drop_front(replicate_arg_index * num_replicas) .take_front(num_replicas); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 2be6ee7a78c..6e106b278fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -34,7 +34,9 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" @@ -113,12 +115,23 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, return launch; } +// Checks if an operation is a supported TPU embedding op. +bool IsEmbeddingOp(Operation* op) { + return isa(op); +} + // Returns a set of ops that are outside compiled and can be extracted to before // the TPU computation. These ops are either connected to the inputs of the TPU // computation or other ops that can be extracted, and have no operands from // other ops in the TPU computation that cannot be extracted. llvm::SmallVector FindOutsideCompiledOpsAtHead( + const TF::SideEffectAnalysis& side_effect_analysis, tf_device::ClusterOp cluster) { + const auto& analysis = side_effect_analysis.GetAnalysisForFunc( + cluster.getParentOfType()); Region* cluster_region = &cluster.body(); llvm::SmallSetVector head_outside_compiled_ops; @@ -127,6 +140,24 @@ llvm::SmallVector FindOutsideCompiledOpsAtHead( if (!HasOutsideCompilationAttribute(&cluster_op)) continue; // An outside compiled op can be extracted if its operands are not from // other ops in the cluster that cannot be extracted. + + // Check if the side effecting op right before this side effecting op, if + // it is side effecting, can be head extracted. Because of op ordering due + // to side effects, if this is not true, this op cannot be head extracted. + // TODO(lyandy): Remove special handling of embedding ops. Currently the IR + // is in a topological sort order and depending on that ordering, embedding + // ops may prevent other ops from being head extracted. + auto predecessors = analysis.DirectControlPredecessors(&cluster_op); + if (!predecessors.empty() && !IsEmbeddingOp(&cluster_op)) { + bool skip = false; + for (Operation* predecessor : llvm::reverse(predecessors)) { + if (IsEmbeddingOp(predecessor)) continue; + skip = !head_outside_compiled_ops.contains(predecessor); + break; + } + if (skip) continue; + } + auto walk_result = cluster_op.walk([&](Operation* op) { for (Value operand : op->getOperands()) { Operation* operand_op = GetOpOfValue(operand); @@ -168,11 +199,11 @@ void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, // Extracts and move outside compiled ops that have no dependencies in the // cluster to before the cluster. 
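The predecessor gate added above can be read as: for a candidate head op, look at its direct control predecessors from nearest to farthest, ignore embedding ops (the special case flagged in the TODO), and allow extraction only if the first relevant predecessor was itself already extracted. A small sketch of that decision, with OpRef and the extracted-ID set as hypothetical stand-ins for the side-effect analysis results:

#include <set>
#include <vector>

struct OpRef {
  int id = 0;
  bool is_embedding = false;
};

bool PredecessorsAllowHeadExtraction(
    const std::vector<OpRef>& predecessors,  // program order, nearest last
    const std::set<int>& already_extracted_ids) {
  for (auto it = predecessors.rbegin(); it != predecessors.rend(); ++it) {
    if (it->is_embedding) continue;  // embedding ops are skipped for now
    // Decision rests on the nearest non-embedding predecessor only.
    return already_extracted_ids.count(it->id) > 0;
  }
  return true;  // only embedding (or no) predecessors: nothing blocks it
}

The tail-extraction check later in this file is the mirror image, walking direct control successors instead of predecessors.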
mlir::LogicalResult LiftHeadOutsideCompiledOps( - OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, - tf_device::ClusterOp cluster, std::string* host_device, - bool* cluster_updated) { + OpBuilder* builder, const TF::SideEffectAnalysis& side_effect_analysis, + const mlir::TF::RuntimeDevices& devices, tf_device::ClusterOp cluster, + std::string* host_device, bool* cluster_updated) { llvm::SmallVector head_outside_compiled_ops = - FindOutsideCompiledOpsAtHead(cluster); + FindOutsideCompiledOpsAtHead(side_effect_analysis, cluster); if (head_outside_compiled_ops.empty()) return success(); if (failed(tensorflow::GetHostDeviceOutsideComputation(devices, cluster, host_device))) @@ -191,9 +222,12 @@ mlir::LogicalResult LiftHeadOutsideCompiledOps( // TPU computation or other ops that can be extracted, and have no results used // by other ops in the TPU computation that cannot be extracted. void FindOutsideCompiledOpsAtTailAndClusterResults( + const TF::SideEffectAnalysis& side_effect_analysis, tf_device::ClusterOp cluster, llvm::SmallVectorImpl* tail_outside_compiled_ops, llvm::SmallVectorImpl* cluster_results) { + const auto& analysis = side_effect_analysis.GetAnalysisForFunc( + cluster.getParentOfType()); Region* cluster_region = &cluster.body(); llvm::SmallSetVector tail_outside_compiled_ops_set; Operation* terminator = cluster.GetBody().getTerminator(); @@ -205,6 +239,24 @@ void FindOutsideCompiledOpsAtTailAndClusterResults( for (Operation& cluster_op : cluster_ops) { if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + // Check if the side effecting op right after this side effecting op, if + // it is side effecting, can be tail extracted. Because of op ordering due + // to side effects, if this is not true, this op cannot be tail extracted. + // TODO(lyandy): Remove special handling of embedding ops. Currently the IR + // is in a topological sort order and depending on that ordering, embedding + // ops may prevent other ops from being tail extracted. + auto successors = analysis.DirectControlSuccessors( + &cluster_op, [&terminator](Operation* op) { return op != terminator; }); + if (!successors.empty() && !IsEmbeddingOp(&cluster_op)) { + bool skip = false; + for (Operation* successor : successors) { + if (IsEmbeddingOp(successor)) continue; + skip = !tail_outside_compiled_ops_set.contains(successor); + break; + } + if (skip) continue; + } + llvm::SmallVector results_to_forward; bool can_be_extracted = llvm::all_of(cluster_op.getUsers(), [&](Operation* op) { @@ -293,13 +345,14 @@ tf_device::ClusterOp UpdateClusterResults( // Extracts and move outside compiled ops that do not create dependencies in the // cluster to after the cluster. 
mlir::LogicalResult LiftTailOutsideCompiledOps( - OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, - std::string host_device, tf_device::ClusterOp* cluster, - bool* cluster_updated) { + OpBuilder* builder, const TF::SideEffectAnalysis& side_effect_analysis, + const mlir::TF::RuntimeDevices& devices, std::string host_device, + tf_device::ClusterOp* cluster, bool* cluster_updated) { llvm::SmallVector tail_outside_compiled_ops; llvm::SmallVector cluster_results; - FindOutsideCompiledOpsAtTailAndClusterResults( - *cluster, &tail_outside_compiled_ops, &cluster_results); + FindOutsideCompiledOpsAtTailAndClusterResults(side_effect_analysis, *cluster, + &tail_outside_compiled_ops, + &cluster_results); if (tail_outside_compiled_ops.empty()) return success(); if (host_device.empty()) @@ -331,7 +384,8 @@ void RemoveClusterAliasedOutputs(OpBuilder* builder, for (auto result : llvm::zip(cluster_terminator->getOperands(), cluster.getResults())) { Value cluster_terminator_operand = std::get<0>(result); - if (cluster.getOperation()->isProperAncestor( + if (cluster_terminator_operand.getDefiningOp() && + cluster.getOperation()->isProperAncestor( cluster_terminator_operand.getDefiningOp())) { new_cluster_results.push_back(cluster_terminator_operand); new_cluster_result_types.push_back(cluster_terminator_operand.getType()); @@ -364,6 +418,7 @@ struct TPUExtractHeadTailOutsideCompilation }; void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + auto& side_effect_analysis = getAnalysis(); // Get runtime devices information from the closest parent module. auto module = getOperation(); mlir::TF::RuntimeDevices devices; @@ -378,10 +433,12 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { for (tf_device::ClusterOp cluster : clusters) { std::string host_device; bool cluster_updated = false; - if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster, - &host_device, &cluster_updated)) || - failed(LiftTailOutsideCompiledOps(&builder, devices, host_device, - &cluster, &cluster_updated))) + if (failed(LiftHeadOutsideCompiledOps(&builder, side_effect_analysis, + devices, cluster, &host_device, + &cluster_updated)) || + failed(LiftTailOutsideCompiledOps(&builder, side_effect_analysis, + devices, host_device, &cluster, + &cluster_updated))) return signalPassFailure(); if (cluster_updated) RemoveClusterAliasedOutputs(&builder, cluster); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index cbea4ae6544..303b69c2730 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -17,11 +17,21 @@ limitations under the License. 
#include #include +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -77,51 +87,315 @@ struct TPUExtractOutsideCompilation void runOnOperation() override; }; -// Collects and clusters ops in `block` with the same `_xla_outside_compilation` -// attribute into `clusters` This returns an error if a -// `_xla_outside_compilation` attribute of an op is empty. -LogicalResult CollectAndGroupOutsideClusterOps(Block* block, - OutsideClusterMap* clusters) { - for (Operation& op : *block) { - if (auto attr = op.getAttrOfType(kXlaOutsideCompilationAttr)) { - if (attr.getValue().empty()) - return op.emitError() - << "attribute '" << kXlaOutsideCompilationAttr << "' is empty"; +// Holds information about control flow operations that wrap outside compiled +// op. Currently only tf.IfRegion and tf.WhileRegion ops are supported. +class ControlFlowStackInfo { + public: + enum ControlFlowBranchType { kIfThen, kIfElse, kWhileCond, kWhileBody }; - auto it = clusters->try_emplace(attr.getValue()); - it.first->getSecond().push_back(&op); + explicit ControlFlowStackInfo(Operation* wrapping_op, Operation* nested_op) + : callsite_op_(wrapping_op) { + if (auto control_flow_op = llvm::dyn_cast(callsite_op_)) { + auto parent_region = nested_op->getParentRegion(); + if (&control_flow_op.then_branch() == parent_region) { + type_ = ControlFlowBranchType::kIfThen; + } else { + type_ = ControlFlowBranchType::kIfElse; + } + } else if (auto control_flow_op = + llvm::dyn_cast(callsite_op_)) { + auto parent_region = nested_op->getParentRegion(); + if (&control_flow_op.cond() == parent_region) { + type_ = ControlFlowBranchType::kWhileCond; + } else { + type_ = ControlFlowBranchType::kWhileBody; + } + } else { + assert(false); } } - return success(); + Value GetIfPredicateValue() { + auto if_op = llvm::cast(callsite_op_); + return if_op.cond(); + } + + ControlFlowBranchType GetBranchType() const { return type_; } + + Operation* GetCallSiteOp() const { return callsite_op_; } + + private: + ControlFlowBranchType type_; + + // `this` does not hold ownership of `callsite_op_`. + Operation* callsite_op_; +}; + +// Returns a list of ControlFlowStackInfo that represents a stack of control +// flow operations that wraps `op`. 
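ControlFlowStackInfo above records which IfRegion/WhileRegion branch an outside compiled op lives in, and the stack is built by walking parent ops upward until the TPU cluster is reached, keeping frames outermost-first. A plain sketch of that upward walk over a hypothetical parent-linked node type:

#include <string>
#include <vector>

struct RegionNode {
  std::string kind;  // "tf.IfRegionOp", "tf.WhileRegionOp", or anything else
  RegionNode* parent = nullptr;
};

// Collects the control-flow frames wrapping `op`, stopping at `cluster`,
// ordered from outermost to innermost like GetControlFlowStackForOp.
std::vector<RegionNode*> ControlFlowStackFor(RegionNode* op,
                                             RegionNode* cluster) {
  std::vector<RegionNode*> stack;
  for (RegionNode* cur = op->parent; cur != nullptr && cur != cluster;
       cur = cur->parent) {
    if (cur->kind == "tf.IfRegionOp" || cur->kind == "tf.WhileRegionOp")
      stack.insert(stack.begin(), cur);  // prepend to keep outermost first
  }
  return stack;
}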
+llvm::SmallVector GetControlFlowStackForOp( + tf_device::ClusterOp tpu_cluster, Operation* op) { + assert(tpu_cluster.getOperation()->isProperAncestor(op)); + + llvm::SmallVector controlflow_stack; + Operation* op_in_stack = op; + while (op_in_stack != tpu_cluster.getOperation()) { + auto parent_op = op_in_stack->getParentOp(); + if (llvm::isa(parent_op)) { + controlflow_stack.insert(controlflow_stack.begin(), + ControlFlowStackInfo(parent_op, op_in_stack)); + } + op_in_stack = parent_op; + } + + return controlflow_stack; } -// Moves `cluster_ops` to associated `launch_op` body. -void MoveOutsideClusterOpsToLaunchOp(tf_device::LaunchOp launch_op, - llvm::ArrayRef cluster_ops) { - MLIRContext* context = launch_op.getContext(); - Operation* terminator = launch_op.GetBody().getTerminator(); +// Creates a IfRegionOp with `predicate` and then/else region with yield op and +// an empty block. +TF::IfRegionOp CloneEmptyIfWithPredicate(Value predicate, bool is_stateless, + Location loc, OpBuilder* builder) { + auto host_side_if = builder->create( + loc, llvm::SmallVector{}, predicate, is_stateless); + // Create empty then branch region. + auto& then_branch = host_side_if.then_branch(); + then_branch.push_back(new Block); + builder->setInsertionPointToEnd(&then_branch.front()); + builder->create(loc, /*operands=*/ArrayRef{}); + + // Create empty else branch region. + auto& else_branch = host_side_if.else_branch(); + else_branch.push_back(new Block); + builder->setInsertionPointToEnd(&else_branch.front()); + builder->create(loc, /*operands=*/ArrayRef{}); + return host_side_if; +} + +// Replicates tf.IfRegion op to host side computation. +Operation* ReplicateIf(const ControlFlowStackInfo& controlflow_info, + llvm::StringRef outside_cluster_name, + Value compilation_key, OpBuilder* builder, + int* send_recv_counter) { + // Create XlaSendToHostOp to send predicate value from device to host. + OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint(); + auto if_callsite_op = + llvm::cast(controlflow_info.GetCallSiteOp()); + builder->setInsertionPoint(if_callsite_op); + + const auto predicate_send_recv_key = + llvm::formatv("if_predicate_channel_{0}_{1}", outside_cluster_name, + *send_recv_counter) + .str(); + *send_recv_counter += 1; + + auto predicate = if_callsite_op.cond(); + auto predicate_shape = predicate.getType(); + builder->create(if_callsite_op.getLoc(), predicate, + predicate_send_recv_key); + + // Create XlaRecvAtHostOp to receive predicate value from host. + builder->restoreInsertionPoint(insert_point); + auto recv_predicate_at_host = builder->create( + if_callsite_op.getLoc(), llvm::ArrayRef{predicate_shape}, + /*dynamic_key=*/compilation_key, + builder->getStringAttr(predicate_send_recv_key), + /*device_ordinal=*/builder->getI64IntegerAttr(0)); + + // Create host side if op. + return CloneEmptyIfWithPredicate(recv_predicate_at_host.getResult(0), + if_callsite_op.is_stateless(), + if_callsite_op.getLoc(), builder); +} + +// Creates a WhileRegionOp cond and body regions with yield op and +// an empty body. +TF::WhileRegionOp CloneEmptyWhile(bool is_stateless, + uint64_t parallel_iterations, Location loc, + OpBuilder* builder) { + auto host_side_while = builder->create( + loc, /*output=*/ArrayRef{}, /*input=*/ArrayRef{}, + is_stateless, parallel_iterations); + + // Create empty else branch region. 
+ auto& body = host_side_while.body(); + body.push_back(new Block); + builder->setInsertionPointToEnd(&body.front()); + builder->create(loc, /*operands=*/ArrayRef{}); + return host_side_while; +} + +// Replicates tf.WhileRegion op to host side computation. +Operation* ReplicateWhile(const ControlFlowStackInfo& controlflow_info, + llvm::StringRef outside_cluster_name, + Value compilation_key, OpBuilder* builder, + int* send_recv_counter) { + // Create XlaSendToHostOp to send cond region output from device to host. + OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint(); + auto while_callsite_op = + llvm::cast(controlflow_info.GetCallSiteOp()); + builder->setInsertionPoint(while_callsite_op.cond().front().getTerminator()); + + const auto condition_send_recv_key = + llvm::formatv("while_condition_channel_{0}_{1}", outside_cluster_name, + *send_recv_counter) + .str(); + *send_recv_counter += 1; + auto condition = + while_callsite_op.cond().front().getTerminator()->getOperand(0); + builder->create(while_callsite_op.getLoc(), condition, + condition_send_recv_key); + builder->restoreInsertionPoint(insert_point); + + auto host_side_while = CloneEmptyWhile( + while_callsite_op.is_stateless(), while_callsite_op.parallel_iterations(), + while_callsite_op.getLoc(), builder); + + // Create cond region and yield the condition from the device. + auto& cond = host_side_while.cond(); + cond.push_back(new Block); + builder->setInsertionPointToEnd(&cond.front()); + auto recv_condition_at_host = builder->create( + while_callsite_op.getLoc(), llvm::ArrayRef{condition.getType()}, + /*dynamic_key=*/compilation_key, + builder->getStringAttr(condition_send_recv_key), + /*device_ordinal=*/builder->getI64IntegerAttr(0)); + builder->create(while_callsite_op.getLoc(), + recv_condition_at_host.getResults()); + + return host_side_while; +} + +// TODO(b/157054714): Use a better abstraction instead of +// _TPUCompileMlirOp and _XlaRecvAtHostOp and _XlaSendFromHostOp. +// Creates a compilation key as placeholder. A placeholder compilation cache key +// is created because it is a required input to _XlaRecvAtHost and +// _XlaSendFromHost but the _TPUCompileMlir has not yet been created for the TPU +// cluster that contains the outside compiled ops. This placeholder should be +// replaced by the TPU cluster _TPUCompileMlir in a subsequent pass. +Value CreateCompilationKeyPlaceholder(Location loc, OpBuilder* builder) { + auto result_type = + RankedTensorType::get({2}, builder->getType()); + return builder->create( + loc, /*program=*/result_type, llvm::ArrayRef{}); +} + +// Replicates the control flow operations that wraps outside compiled ops to +// `destination_block`. +Operation* ReplicateControlFlowStack( + llvm::StringRef outside_cluster_name, + const llvm::SmallVectorImpl& stack_info, + tf_device::ClusterOp tpu_cluster, ModuleOp module, Value compilation_key, + Block* destination_block, int* send_recv_counter) { + assert(stack_info.size()); + OpBuilder builder = OpBuilder::atBlockTerminator(destination_block); + Operation* previous_replicated_controlflow_op = nullptr; + for (const auto& controlflow_stack_info : stack_info) { + // Create control flow op given provided insertion point and + // ControlFlowStackInfo. 
+ if (auto control_flow_op = llvm::dyn_cast( + controlflow_stack_info.GetCallSiteOp())) { + previous_replicated_controlflow_op = + ReplicateIf(controlflow_stack_info, outside_cluster_name, + compilation_key, &builder, send_recv_counter); + auto if_op = + llvm::cast(previous_replicated_controlflow_op); + auto type = controlflow_stack_info.GetBranchType(); + + // Update the insertion point to proper region inside the newly created + // control flow op. + if (type == ControlFlowStackInfo::kIfThen) { + builder.setInsertionPoint(&if_op.then_branch().front().front()); + } else { + builder.setInsertionPoint(&if_op.else_branch().front().front()); + } + } else if (auto control_flow_op = llvm::dyn_cast( + controlflow_stack_info.GetCallSiteOp())) { + previous_replicated_controlflow_op = + ReplicateWhile(controlflow_stack_info, outside_cluster_name, + compilation_key, &builder, send_recv_counter); + auto while_op = + llvm::cast(previous_replicated_controlflow_op); + auto type = controlflow_stack_info.GetBranchType(); + if (type == ControlFlowStackInfo::kWhileCond) { + builder.setInsertionPoint(&while_op.cond().front().front()); + } else { + builder.setInsertionPoint(&while_op.body().front().front()); + } + } + } + + // Return operation which should be used to as the insertion point to create + // send/recv ops. + if (auto inner_most_if = + llvm::dyn_cast(previous_replicated_controlflow_op)) { + auto inner_most_controlflow_stack = stack_info.back(); + if (inner_most_controlflow_stack.GetBranchType() == + ControlFlowStackInfo::kIfThen) { + return inner_most_if.then_branch().front().getTerminator(); + } else { + return inner_most_if.else_branch().front().getTerminator(); + } + } else if (auto inner_most_while = llvm::dyn_cast( + previous_replicated_controlflow_op)) { + auto inner_most_controlflow_stack = stack_info.back(); + if (inner_most_controlflow_stack.GetBranchType() == + ControlFlowStackInfo::kWhileCond) { + return &inner_most_while.cond().front().front(); + } else { + return inner_most_while.body().front().getTerminator(); + } + } + return destination_block->getTerminator(); +} + +// Collects and clusters ops in `block` with the same `_xla_outside_compilation` +// attribute into `clusters` This returns an error if a +// `_xla_outside_compilation` attribute of an op is empty. +// TODO(b/163141763): Make sure ops inside control flow regions are not outside +// compiled if the entire control flow op is marked as outside compiled. +LogicalResult CollectAndGroupOutsideClusterOps(Block* block, + OutsideClusterMap* clusters) { + auto walk_result = block->walk([&](Operation* op) { + if (auto attr = op->getAttrOfType(kXlaOutsideCompilationAttr)) { + if (attr.getValue().empty()) { + op->emitError() << "attribute '" << kXlaOutsideCompilationAttr + << "' is empty"; + return WalkResult::interrupt(); + } + + auto it = clusters->try_emplace(attr.getValue()); + it.first->getSecond().push_back(op); + } + return WalkResult::advance(); + }); + + return failure(walk_result.wasInterrupted()); +} + +// Moves `cluster_ops` before `op`. +void MoveOutsideClusterOpsBeforeOp(Operation* op, + llvm::ArrayRef cluster_ops, + MLIRContext* context) { for (Operation* cluster_op : cluster_ops) { // Remove `_xla_outside_compilation` and `device` attribute from ops in the // cluster as that information will be present in the `launch_op`. 
cluster_op->removeAttr( Identifier::get(kXlaOutsideCompilationAttr, context)); cluster_op->removeAttr(Identifier::get(kDeviceAttr, context)); - cluster_op->moveBefore(terminator); + cluster_op->moveBefore(op); } } -// Creates a `tf_device::LaunchOp` to wrap cluster ops. +// Creates a `tf_device.launch` to wrap cluster ops. tf_device::LaunchOp CreateLaunchOpForOutsideCluster( OpBuilder* builder, Operation* last_cluster_op, llvm::StringRef host_device) { // An empty string placeholder is used for the device as that will be later // populated with the device of the associated TPUReplicateMetadata op. - llvm::SmallVector result_types; auto launch_op = builder->create( last_cluster_op->getLoc(), builder->getStringAttr(host_device), - result_types); + /*result_types=*/ArrayRef{}); launch_op.body().push_back(new Block); @@ -133,21 +407,61 @@ tf_device::LaunchOp CreateLaunchOpForOutsideCluster( return launch_op; } -// Extracts all externally provided operands of `cluster_ops`. +// Extracts all externally provided operands of `host_cluster_ops`. llvm::SmallSetVector GetExternalOperands( - llvm::ArrayRef cluster_ops) { + tf_device::ClusterOp tpu_cluster, + llvm::ArrayRef host_cluster_ops) { llvm::SmallSetVector external_values; - for (Operation* op : cluster_ops) { - for (Value v : op->getOperands()) { - Operation* defining_op = v.getDefiningOp(); - if (!defining_op) continue; - bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { - return defining_op == cluster_op; - }); + for (Operation* host_cluster_op : host_cluster_ops) { + auto cluster_op_parent_region = host_cluster_op->getParentRegion(); + host_cluster_op->walk([&](Operation* op) { + auto region = op->getParentRegion(); - if (is_external) external_values.insert(v); - } + if (region == cluster_op_parent_region) { + // For op operands, add operand defining ops, if they are not included + // in `host_cluster_ops`. + for (Value v : op->getOperands()) { + Operation* defining_op = v.getDefiningOp(); + bool is_external = false; + if (defining_op) { + is_external = + llvm::none_of(host_cluster_ops, [&](Operation* cluster_op) { + return defining_op == cluster_op; + }); + } else { + if (auto block_arg = v.dyn_cast()) { + if (block_arg.getParentRegion() == cluster_op_parent_region) + is_external = true; + } + } + if (is_external) external_values.insert(v); + } + } else { + llvm::SetVector external_captured_inputs; + visitUsedValuesDefinedAbove(*region, *region, [&](OpOperand* operand) { + Region* operand_defined_region = operand->get().getParentRegion(); + if (!tpu_cluster.body().isAncestor(operand_defined_region)) return; + // If the host_cluster_op is regional control flow (if, while), + // then check if the operand_defined_region is an ancestor of the + // control flow regions. 
+ if (auto if_op = llvm::dyn_cast(host_cluster_op)) { + if (if_op.then_branch().isAncestor(operand_defined_region) || + if_op.else_branch().isAncestor(operand_defined_region)) + return; + } + if (auto while_op = + llvm::dyn_cast(host_cluster_op)) { + if (while_op.cond().isAncestor(operand_defined_region) || + while_op.body().isAncestor(operand_defined_region)) + return; + } + external_captured_inputs.insert(operand->get()); + }); + external_values.insert(external_captured_inputs.begin(), + external_captured_inputs.end()); + } + }); } return external_values; @@ -212,34 +526,42 @@ TF::_XlaHostComputeMlirOp CreateHostCompute( } void MoveOutsideCompiledOps( - tf_device::ClusterOp tpu_cluster, llvm::StringRef outside_cluster_name, - tf_device::LaunchOp host_launch_op, llvm::ArrayRef cluster_ops, + ModuleOp module, tf_device::ClusterOp tpu_cluster, + llvm::StringRef outside_cluster_name, tf_device::LaunchOp host_launch_op, + llvm::ArrayRef cluster_ops, const llvm::SmallSetVector& external_inputs, llvm::ArrayRef external_outputs) { + // Since ops in `cluster_ops` do not cross function/control flow boundary, it + // is sufficient to identify the control flow that wraps `cluster_ops` by + // looking at any arbitary op inside `cluster_ops`. + auto controlflow_stack = + GetControlFlowStackForOp(tpu_cluster, cluster_ops.front()); + + Value compilation_key; + if (!controlflow_stack.empty() || !external_inputs.empty() || + !external_outputs.empty()) { + OpBuilder builder(&host_launch_op.GetBody().front()); + compilation_key = + CreateCompilationKeyPlaceholder(tpu_cluster.getLoc(), &builder); + } + + Operation* insertion_op = nullptr; + if (controlflow_stack.empty()) { + insertion_op = host_launch_op.GetBody().getTerminator(); + } else { + int send_recv_counter = 0; + insertion_op = ReplicateControlFlowStack( + outside_cluster_name, controlflow_stack, tpu_cluster, module, + compilation_key, &host_launch_op.GetBody(), &send_recv_counter); + } + + MLIRContext* context = host_launch_op.getContext(); if (external_inputs.empty() && external_outputs.empty()) { - MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + MoveOutsideClusterOpsBeforeOp(insertion_op, cluster_ops, context); return; } - OpBuilder builder(host_launch_op.GetBody().getTerminator()); - auto result_type = - RankedTensorType::get({}, builder.getType()); - - std::string txt_metadata; - std::string txt_module; - // TODO(b/157054714): Use a better abstraction instead of _TPUCompileMlirOp - // and _XlaRecvAtHostOp and _XlaSendFromHostOp. - - // A placeholder compilation cache key is created because it is a required - // input to _XlaRecvAtHost and _XlaSendFromHost but the _TPUCompileMlir has - // not yet been created for the TPU cluster that contains the outside compiled - // ops. This placeholder should be replaced by the TPU cluster _TPUCompileMlir - // in a subsequent pass. 
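For context, the GetExternalOperands change above leans on MLIR's visitUsedValuesDefinedAbove utility (from RegionUtils.h, already included in this file) to find values that a nested region captures from its surroundings. A minimal sketch of that pattern, assuming only standard MLIR headers; the helper name CollectCapturedValues is illustrative and not part of the patch:

#include "llvm/ADT/SetVector.h"
#include "mlir/IR/Operation.h"            // from @llvm-project
#include "mlir/Transforms/RegionUtils.h"  // from @llvm-project

// Returns every value that is used inside `op`'s regions but defined above
// them, e.g. values implicitly captured by a tf.IfRegion or tf.WhileRegion.
llvm::SetVector<mlir::Value> CollectCapturedValues(mlir::Operation* op) {
  llvm::SetVector<mlir::Value> captured;
  for (mlir::Region& region : op->getRegions())
    mlir::visitUsedValuesDefinedAbove(region, region,
                                      [&](mlir::OpOperand* operand) {
                                        captured.insert(operand->get());
                                      });
  return captured;
}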
- auto compilation_key = - builder.create( - tpu_cluster.getLoc(), /*program=*/result_type, - llvm::ArrayRef{}); - + OpBuilder builder(insertion_op); llvm::SmallVector host_output_types; for (const auto& external_input : external_inputs) host_output_types.push_back(external_input.getType()); @@ -250,6 +572,7 @@ void MoveOutsideCompiledOps( std::string retvals_communication_key = llvm::formatv("host_compute_channel_{0}_retvals", outside_cluster_name) .str(); + auto recv_at_host = builder.create( tpu_cluster.getLoc(), host_output_types, /*dynamic_key=*/compilation_key, @@ -259,9 +582,9 @@ void MoveOutsideCompiledOps( auto host_compute = CreateHostCompute( &builder, tpu_cluster, cluster_ops, external_inputs, external_outputs, args_communication_key, retvals_communication_key); - MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + MoveOutsideClusterOpsBeforeOp(insertion_op, cluster_ops, context); - builder.setInsertionPoint(host_launch_op.GetBody().getTerminator()); + builder.setInsertionPoint(insertion_op); builder.create( tpu_cluster.getLoc(), external_outputs, /*dynamic_key=*/compilation_key, @@ -279,7 +602,8 @@ void MoveOutsideCompiledOps( // Creates a `parallel_execute` op in place of launch with 'clusters` and // 'launch` as regions. -void CreateParallelExecuteFromOutsideClusters(tf_device::ClusterOp tpu_cluster, +void CreateParallelExecuteFromOutsideClusters(ModuleOp module, + tf_device::ClusterOp tpu_cluster, const OutsideClusterMap& clusters, llvm::StringRef host_device) { OpBuilder builder(tpu_cluster); @@ -295,18 +619,18 @@ void CreateParallelExecuteFromOutsideClusters(tf_device::ClusterOp tpu_cluster, Block& outside_block = parallel_execute_op.GetRegionBlockWithIndex(cluster.index()); + builder.setInsertionPointToEnd(&outside_block); tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( &builder, cluster_ops.back(), host_device); // Determine if there are any inputs that are provided out of cluster. - auto external_inputs = GetExternalOperands(cluster_ops); + auto external_inputs = GetExternalOperands(tpu_cluster, cluster_ops); auto external_outputs = GetExternalOutputs(cluster_ops); - MoveOutsideCompiledOps(tpu_cluster, cluster.value().getFirst(), + MoveOutsideCompiledOps(module, tpu_cluster, cluster.value().getFirst(), host_launch_op, cluster_ops, external_inputs, external_outputs); - builder.setInsertionPointToEnd(&outside_block); builder.create(tpu_cluster.getLoc(), ArrayRef{}); @@ -352,7 +676,8 @@ void TPUExtractOutsideCompilation::runOnOperation() { std::string host_device; tensorflow::GetHostDeviceOutsideComputation(devices, tpu_cluster, &host_device); - CreateParallelExecuteFromOutsideClusters(tpu_cluster, clusters, + + CreateParallelExecuteFromOutsideClusters(module, tpu_cluster, clusters, host_device); return WalkResult::advance(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc new file mode 100644 index 00000000000..32b1eb340d6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc @@ -0,0 +1,113 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +// This pass removes Identity/IdentityN ops from the TPU computation and +// reachable functions. +// TODO(lyandy): Remove this pass once resource op lifting is migrated to use +// resource alias analysis and support region based control flow. Removing +// Identity ops may remove `_XlaSharding` annotation attribute if Identity ops +// are used to propagate such information. + +struct TPUIdentityPruning + : public PassWrapper> { + void runOnOperation() override; +}; + +// Collects all reachable functions (via call ops) from a given region. +SmallVector CollectReachableFunctions(Region& region) { + llvm::SmallPtrSet reachable_funcs; + + auto collect_reachable_funcs = + [&reachable_funcs](Region& src, SmallVectorImpl& funcs_to_visit) { + src.walk([&reachable_funcs, &funcs_to_visit](CallOpInterface call_op) { + auto func = dyn_cast_or_null(call_op.resolveCallable()); + if (func && reachable_funcs.insert(func).second) + funcs_to_visit.push_back(func); + }); + }; + + SmallVector funcs_to_visit; + collect_reachable_funcs(region, funcs_to_visit); + + while (!funcs_to_visit.empty()) { + SmallVector new_funcs_to_visit; + for (FuncOp func_to_visit : funcs_to_visit) { + if (!func_to_visit.getCallableRegion()) continue; + collect_reachable_funcs(*func_to_visit.getCallableRegion(), + new_funcs_to_visit); + } + funcs_to_visit.swap(new_funcs_to_visit); + } + + return llvm::to_vector<4>(reachable_funcs); +} + +// Removes Identity/IdentityN ops from a region and forwards its operands to its +// results. 
+void RemoveIdentityFromRegion(Region& region) { + region.walk([](Operation* op) { + if (isa(op)) { + op->replaceAllUsesWith(op->getOperands()); + op->erase(); + } + }); +} + +void TPUIdentityPruning::runOnOperation() { + SmallVector clusters; + getOperation().walk( + [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); + + for (tf_device::ClusterOp cluster : clusters) { + RemoveIdentityFromRegion(cluster.body()); + auto reachable_funcs = CollectReachableFunctions(cluster.body()); + for (FuncOp reachable_func : reachable_funcs) + RemoveIdentityFromRegion(*reachable_func.getCallableRegion()); + } +} + +} // anonymous namespace + +std::unique_ptr> CreateTPUIdentityPruningPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-identity-pruning", + "Removes Identity/IdentityN ops from the TPU computation"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc index be01b7644ea..900bdf6f519 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -22,6 +23,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -33,8 +35,10 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; struct TPUOutsideCompilationCluster - : public PassWrapper { - void runOnFunction() override; + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TPUOutsideCompilationCluster, TF::SideEffectAnalysis> { + void runOnFunction(FuncOp func, + const TF::SideEffectAnalysis::Info& side_effect_analysis); }; // Represents an outside compiled cluster. All ops that are added to the same @@ -44,72 +48,86 @@ class OutsideCompiledCluster { explicit OutsideCompiledCluster(int number) : cluster_name_(llvm::formatv("cluster{0}", number).str()) {} - // Attempts to add an op to this cluster. - // This function requires all ops to be added before their uses. - bool AddOp(Operation* op) { + // Attempts to add an op to this cluster. Ops can be grouped to the same + // cluster if they have data dependency and are inside the same block. + bool AddOp(Operation* op, + const TF::SideEffectAnalysis::Info& side_effect_analysis) { // Check if the op is safe to add before adding it. - bool add = IsSafeToAdd(op); - if (add) { - // Set the ops kXlaOutsideCompilationAttr to the cluster name. + if (IsSafeToAdd(op, side_effect_analysis)) { op->setAttr(kXlaOutsideCompilationAttr, StringAttr::get(cluster_name_, op->getContext())); - - // Since we are adding the op to the cluster, the op is no longer - // considered a user of this cluster. 
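The tpu_identity_pruning.cc pass added above is a plain ModuleOp pass. Assuming its factory is declared in the usual TFTPU passes header (an assumption; the header change is not shown in this diff), wiring it into a pass pipeline would look roughly like this sketch:

#include "mlir/Pass/PassManager.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

void AddTPUIdentityPruning(mlir::PassManager& pm) {
  // Prunes tf.Identity/tf.IdentityN inside tf_device.cluster regions and in
  // functions reachable from them via call ops.
  pm.addPass(mlir::TFTPU::CreateTPUIdentityPruningPass());
}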
- users_.erase(op); + host_cluster_ops_.insert(op); + return true; } - - // Add this op's users to the cluster users. - users_.insert(op->user_begin(), op->user_end()); - return add; + return false; } private: // Checks if it is safe for an op to be merged into this cluster. - bool IsSafeToAdd(Operation* op) { + bool IsSafeToAdd(Operation* op, + const TF::SideEffectAnalysis::Info& side_effect_analysis) { + if (closed_) return false; // If the op is not marked for outside compilation it doesn't belong in a // cluster. - if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) + if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) { + auto successors = side_effect_analysis.DirectControlSuccessors(op); + // If a non-outside-compiled op with side effect successors is encountered, + // close this cluster to additions so that no cluster cyclic dependencies + // can be created. + if (!successors.empty()) { + closed_ = true; + } return false; - - // Checks to see if the op's operands are related to this - // clusters users. If they are related, then there is an op between this - // op and the cluster. Since ops are added before their uses, there - // is no way for the op in-between to ever be added to this cluster - // therefore there is no way this op can ever be added to the cluster. - for (const Value& value : op->getOperands()) { - Operation* op_operand = value.getDefiningOp(); - if (op_operand && users_.find(op_operand) != users_.end()) return false; } - return true; + + if (host_cluster_ops_.empty()) return true; + + // Checks whether there is a data dependency between ops in + // `host_cluster_ops_` and `op`. + const bool contains_data_dependency = llvm::any_of( + op->getUsers(), + [&](Operation* user) { return host_cluster_ops_.contains(user); }); + + const bool inside_same_block = + llvm::all_of(host_cluster_ops_, [&](Operation* op_in_cluster) { + return op_in_cluster->getBlock() == op->getBlock(); + }); + + return inside_same_block && contains_data_dependency; } - // users_ stores the direct and indirect users of the outside compiled ops in - // this cluster. It does NOT store the outside compiled ops that are a part - // of this cluster that will be collectively extracted and run on the cpu. - // users_ is consulted when attempting to add a new outside compiled to the - // cluster. If the new op's operand(s) are already in users_, it means that - // the operand(s) were not added to the cluster so it is not safe to add the - // new op to the cluster either. - llvm::SmallPtrSet users_; + // `host_cluster_ops_` stores the set of ops that will be grouped and computed + // on the host as a single XlaHostCompute op. An outside compiled op can be + // grouped into a cluster if it has a data dependency on another op already in + // the cluster. + llvm::SmallPtrSet host_cluster_ops_; std::string cluster_name_; + bool closed_ = false; // Cluster is closed to further additions.
}; -void TPUOutsideCompilationCluster::runOnFunction() { +void TPUOutsideCompilationCluster::runOnFunction( + FuncOp func, const TF::SideEffectAnalysis::Info& side_effect_analysis) { llvm::SmallVector clusters; int cluster_counter = 0; - getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { - for (Operation& op : tpu_cluster.GetBody()) { + func.walk([&](tf_device::ClusterOp tpu_cluster) { + llvm::SmallVector tpu_cluster_ops; + tpu_cluster_ops.reserve(tpu_cluster.getBody()->getOperations().size()); + + tpu_cluster.walk([&](Operation* op) { tpu_cluster_ops.emplace_back(op); }); + + // In order to cluster ops feeding results to the same operation, traverse + // the ops in reverse order. + for (Operation* op : llvm::reverse(tpu_cluster_ops)) { // Try to add the op to existing clusters. bool added = false; for (auto& cluster : clusters) - if ((added = cluster.AddOp(&op))) break; + if ((added = cluster.AddOp(op, side_effect_analysis))) break; // If the op cannot be added to existing clusters, create a new cluster. if (!added) { OutsideCompiledCluster new_cluster(cluster_counter++); - new_cluster.AddOp(&op); + new_cluster.AddOp(op, side_effect_analysis); clusters.push_back(new_cluster); } } @@ -118,7 +136,7 @@ void TPUOutsideCompilationCluster::runOnFunction() { } // anonymous namespace -std::unique_ptr> +std::unique_ptr> CreateTPUOutsideCompilationClusterPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc new file mode 100644 index 00000000000..45773a128fd --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc @@ -0,0 +1,166 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +// A pass that moves `tf.AssignVariableOp` into a `tf_device.parallel_execute` +// region if the `tf.AssignVariableOp` is the only consumer of a +// `tf_device.parallel_execute` result. This will allow +// TPUMergeVariablesWithExecute to merge resource writes without special +// handling for `tf_device.parallel_execute`. +struct TPUParallelExecuteSinkResourceWrite + : public PassWrapper { + void runOnFunction() override; +}; + +// Finds an AssignVariableOp that can be moved into the parallel_execute region. 
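The clustering logic above admits an outside compiled op into an existing cluster only when it sits in the same block as the cluster and directly feeds an op already in it. Read in isolation, the criterion amounts to something like the following sketch (CanJoinCluster is an illustrative name, not part of the patch):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "mlir/IR/Operation.h"  // from @llvm-project

// Returns true if `op` may join `cluster_ops`: it must live in the same block
// as every op already in the cluster and have at least one user in it.
bool CanJoinCluster(mlir::Operation* op,
                    const llvm::SmallPtrSetImpl<mlir::Operation*>& cluster_ops) {
  if (cluster_ops.empty()) return true;
  const bool feeds_cluster =
      llvm::any_of(op->getUsers(), [&](mlir::Operation* user) {
        return cluster_ops.count(user) > 0;
      });
  const bool same_block =
      llvm::all_of(cluster_ops, [&](mlir::Operation* cluster_op) {
        return cluster_op->getBlock() == op->getBlock();
      });
  return feeds_cluster && same_block;
}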
+// These AssignVariableOps must be the only consumer of the respective +// parallel_execute result, and the resource handle producer must be from an op +// before or above the parallel_execute. +TF::AssignVariableOp GetSingleUseResourceWrite( + tf_device::ParallelExecuteOp parallel_execute, Value result) { + if (!result.hasOneUse()) return nullptr; + + OpOperand& use = *result.getUses().begin(); + auto assign_var = dyn_cast(use.getOwner()); + if (!assign_var) return nullptr; + + if (use.get() != assign_var.value()) return nullptr; + + auto* resource_handle_op = assign_var.resource().getDefiningOp(); + if (resource_handle_op == parallel_execute) return nullptr; + + if (resource_handle_op && + resource_handle_op->getBlock() == + parallel_execute.getOperation()->getBlock() && + parallel_execute.getOperation()->isBeforeInBlock(resource_handle_op)) + return nullptr; + + return assign_var; +} + +// Finds AssignVariableOps that can be moved into a parallel_execute region and +// moves them. Leftover parallel_execute results that were used by such +// AssignVariableOps are also pruned. +void SinkResourceWritesIntoParallelExecute( + tf_device::ParallelExecuteOp parallel_execute) { + bool rewrite = false; + const int num_regions = parallel_execute.getNumRegions(); + llvm::SmallVector results_to_remap; + + // Go through each region and find AssignVariableOps that can be moved into + // the parallel_execute region. Result indices by region index are collected, + // so they can be removed afterwards. + llvm::SmallVector, 4> results_to_remove_by_region; + results_to_remove_by_region.resize(num_regions); + for (int i = 0; i < num_regions; ++i) { + Block& block = parallel_execute.GetRegionBlockWithIndex(i); + auto results = parallel_execute.GetRegionOutputs(i); + auto& results_to_remove = results_to_remove_by_region[i]; + results_to_remove.reserve(results.size()); + Operation* terminator = block.getTerminator(); + for (auto result : llvm::enumerate(results)) { + TF::AssignVariableOp assign_var = + GetSingleUseResourceWrite(parallel_execute, result.value()); + if (!assign_var) { + results_to_remap.push_back(result.value()); + continue; + } + + // Move the AssignVariableOp and update the value to be written to the + // resource variable so that it is the non-forwarded value from within the + // parallel_execute region. + assign_var.getOperation()->moveBefore(terminator); + assign_var.valueMutable().assign(terminator->getOperand(result.index())); + results_to_remove.push_back(result.index()); + } + + rewrite |= !results_to_remove.empty(); + } + + if (!rewrite) return; + + // Remove leftover unused results (terminator operands) from moving + // AssignVariableOps into the parallel_execute region. + for (auto results_to_remove : llvm::enumerate(results_to_remove_by_region)) { + Block& block = + parallel_execute.GetRegionBlockWithIndex(results_to_remove.index()); + Operation* terminator = block.getTerminator(); + for (int index_to_remove : llvm::reverse(results_to_remove.value())) + terminator->eraseOperand(index_to_remove); + } + + // Replace old parallel_execute with new parallel_execute by moving the + // regions to a new parallel_execute and remapping the results.
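The defining-op check in GetSingleUseResourceWrite above exists because sinking a write whose resource handle is produced after the parallel_execute in the same block would move the AssignVariableOp to a point where its resource operand is not yet defined. A sketch of that ordering check in isolation, mirroring the pass's same-block special case (IsDefinedBefore is an illustrative name):

#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/IR/Value.h"      // from @llvm-project

// Returns true if `value` is available at `point`: it is a block argument,
// defined in another block (treated as available, as in the pass above), or
// defined by an op that comes before `point` in the same block.
bool IsDefinedBefore(mlir::Value value, mlir::Operation* point) {
  mlir::Operation* def = value.getDefiningOp();
  if (!def) return true;  // Block arguments dominate the whole block.
  if (def->getBlock() != point->getBlock()) return true;
  return def->isBeforeInBlock(point);
}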
+ llvm::SmallVector new_result_types; + new_result_types.reserve(results_to_remap.size()); + for (Value old_result : results_to_remap) + new_result_types.push_back(old_result.getType()); + + OpBuilder builder(parallel_execute); + auto new_parallel_execute = builder.create( + parallel_execute.getLoc(), num_regions, new_result_types); + + for (auto region : llvm::zip(new_parallel_execute.getRegions(), + parallel_execute.getRegions())) + std::get<0>(region)->takeBody(*std::get<1>(region)); + + for (auto result : + llvm::zip(results_to_remap, new_parallel_execute.getResults())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); + + parallel_execute.erase(); +} + +void TPUParallelExecuteSinkResourceWrite::runOnFunction() { + llvm::SmallVector parallel_executes; + getFunction().walk([&](tf_device::ParallelExecuteOp parallel_execute) { + parallel_executes.push_back(parallel_execute); + }); + + for (tf_device::ParallelExecuteOp parallel_execute : parallel_executes) + SinkResourceWritesIntoParallelExecute(parallel_execute); +} + +} // anonymous namespace + +std::unique_ptr> +CreateTPUParallelExecuteSinkResourceWritePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-parallel-execute-sink-resource-write", + "Moves tf.AssignVariableOp consumers of tf_device.parallel_execute into " + "tf_device.parallel_execute regions"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc new file mode 100644 index 00000000000..cccd528da1d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc @@ -0,0 +1,140 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TFTPU { + +// A pass that finds TPU clusters with write only resource access and adds an +// associated resource read, so the resource can later be fused into TPUExecute. +namespace { +struct TPUResourceReadForWrite + : public PassWrapper> { + void runOnOperation() override; +}; + +// Helper struct holding a resource value and its associated type. 
+struct ResourceValueAndSubtype { + Value resource; + Type subtype; +}; + +// Finds resource handle and type for result if result writes to a resource. +ResourceValueAndSubtype GetResourceWriteResult( + tf_device::ClusterFuncOp cluster_func, Value result) { + ResourceValueAndSubtype resource; + if (!result.hasOneUse()) return resource; + Operation* result_user = *result.getUsers().begin(); + auto assign_var = dyn_cast(result_user); + if (!assign_var) return resource; + + auto handle = assign_var.resource(); + // Skip result if cluster writes to the same variable via multiple results. + for (Operation* handle_user : handle.getUsers()) { + if (handle_user == assign_var) continue; + auto assign_var_user = dyn_cast(handle_user); + if (!assign_var_user) continue; + if (assign_var_user.value().getDefiningOp() == cluster_func) + return resource; + } + + resource.resource = assign_var.resource(); + resource.subtype = assign_var.value().getType(); + return resource; +} + +// Checks if resource is read by TPU cluster. +bool ClusterFuncHasResourceRead(tf_device::ClusterFuncOp cluster_func, + Value resource) { + for (Operation* resource_user : resource.getUsers()) + if (auto read = dyn_cast(resource_user)) + for (Operation* read_user : read.value().getUsers()) + if (read_user == cluster_func) return true; + + return false; +} + +void TPUResourceReadForWrite::runOnOperation() { + SmallVector cluster_funcs; + getOperation().walk([&](tf_device::ClusterFuncOp cluster_func) { + cluster_funcs.push_back(cluster_func); + }); + + OpBuilder builder(&getContext()); + // Add resource reads for resource writes from TPU cluster where for such + // resources the TPU cluster does not read from. + for (tf_device::ClusterFuncOp cluster_func : cluster_funcs) { + builder.setInsertionPoint(cluster_func); + + SmallVector read_operands; + for (Value result : cluster_func.getResults()) { + // TODO(lyandy): Update pass to use resource alias analysis. + auto resource_and_type = GetResourceWriteResult(cluster_func, result); + if (!resource_and_type.resource) continue; + if (ClusterFuncHasResourceRead(cluster_func, resource_and_type.resource)) + continue; + auto new_read = builder.create( + resource_and_type.resource.getLoc(), resource_and_type.subtype, + resource_and_type.resource); + read_operands.push_back(new_read.value()); + } + + if (read_operands.empty()) continue; + + // Update caller and function types with new read operands. 
+ auto operands = llvm::to_vector<4>(cluster_func.getOperands()); + operands.append(read_operands.begin(), read_operands.end()); + + auto new_cluster_func = builder.create( + cluster_func.getLoc(), cluster_func.getResultTypes(), operands, + cluster_func.getAttrs()); + cluster_func.replaceAllUsesWith(new_cluster_func); + FuncOp func = cluster_func.getFunc(); + Block& block = func.front(); + for (Value read_operand : read_operands) + block.addArgument(read_operand.getType()); + + func.setType(FunctionType::get(block.getArgumentTypes(), + func.getCallableResults(), &getContext())); + cluster_func.erase(); + } +} + +} // namespace + +std::unique_ptr> CreateTPUResourceReadForWritePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-resource-read-for-write", + "Inserts tf.ReadVariableOp inputs to a TPU cluster for resource writes " + "with no reads"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index ca77feafc05..86aeec81150 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -25,7 +25,6 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -42,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "tensorflow/compiler/xla/xla.pb.h" @@ -154,11 +154,8 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, symbol_table.insert(clone); } - // Serialize module and return. 
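The EncapsulateFuncAndSerialize hunk around this point swaps a hand-rolled print-to-string for tensorflow::SerializeMlirModule. The underlying operation is just printing the module into a string buffer; a minimal sketch of what such a helper is expected to do (ModuleToString is an illustrative name, not the library API):

#include <string>

#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Module.h"  // from @llvm-project

// Prints `module` in its textual form and returns it as a string.
std::string ModuleToString(mlir::ModuleOp module) {
  std::string serialized;
  llvm::raw_string_ostream os(serialized);
  module.print(os);
  return os.str();  // Flushes the stream and returns the underlying string.
}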
- { - llvm::raw_string_ostream os(*serialized_func_module); - module_for_func.get().print(os); - } + *serialized_func_module = + tensorflow::SerializeMlirModule(module_for_func.get()); return success(); } @@ -409,12 +406,15 @@ Operation* BuildCompileOp( std::string txt_module; if (failed(EncapsulateFuncAndSerialize(func, &txt_module))) return nullptr; - auto result_type = + auto compilation_status_type = RankedTensorType::get({}, builder->getType()); + auto program_type = + RankedTensorType::get({2}, builder->getType()); auto compile_op = builder->create( - cluster_func.getLoc(), /*compilation_status=*/result_type, /*program=*/ - llvm::SmallVector(num_cores_per_replica, result_type), + cluster_func.getLoc(), + /*compilation_status=*/compilation_status_type, /*program=*/ + llvm::SmallVector(num_cores_per_replica, program_type), compile_op_operands, txt_module, txt_metadata); return WrapOpInLaunch(builder, compile_op.getLoc(), compile_op, @@ -598,9 +598,9 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // func @main(%arg0: tensor) { // %0 = "tf.Shape"(%arg0) : (tensor) -> tensor // %1:2 = "tf._TPUCompileMlir"(%0) {device = "/CPU:0"} : -// (tensor) -> (tensor, tensor) +// (tensor) -> (tensor, tensor<2x!tf.string>) // %2 = "tf.TPUExecute"(%arg0, %1#0) {device = "/TPU:0"} : -// (tensor, tensor) -> tensor +// (tensor, tensor<2x!tf.string>) -> tensor // return // } // @@ -624,9 +624,9 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // {n = 2 : i32, devices = ["/TPU:0", "/TPU:1"]} { // %1 = "tf.Shape"(%ri) : (tensor) -> tensor // %2:2 = "tf._TPUCompileMlir"(%1) {device = "/CPU:0"} : -// (tensor) -> (tensor, tensor) +// (tensor) -> (tensor, tensor<2x!tf.string>) // %3 = "tf.TPUExecute"(%ri, %2#0) : -// (tensor, tensor) -> tensor +// (tensor, tensor<2x!tf.string>) -> tensor // tf_device.return %3 : tensor // } // return @@ -644,7 +644,7 @@ LogicalResult Rewrite( int num_replicas = 1; tf_device::ReplicateOp replicate = cluster_func.getParentOfType(); - if (replicate) num_replicas = replicate.n().getLimitedValue(); + if (replicate) num_replicas = replicate.n(); auto num_cores_per_replica_attr = cluster_func.getAttrOfType( tensorflow::kNumCoresPerReplicaAttr); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc index 204a674e632..ecfd6b33503 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -54,6 +54,11 @@ namespace { constexpr char kDeviceAttr[] = "device"; typedef std::pair Conv2DWithBlockSize; +struct BlockArgumentInfo { + unsigned arg_num; + unsigned num_users; +}; + // A pass that applies automatic space to depth transform for the first or // frontier convolutions consume host inputs on TPU. // This is done by adding space to depth transform op after host input and @@ -108,7 +113,49 @@ struct TPUSpaceToDepthPass void runOnOperation() override; }; -// Handle padding before convolution for space to depth transform. +// Updates func argument type to have the updated input shape. 
+void UpdateFuncType(FuncOp func) { + auto arg_types = llvm::to_vector<8>(func.front().getArgumentTypes()); + auto result_types = + llvm::to_vector<4>(func.front().getTerminator()->getOperandTypes()); + func.setType(FunctionType::get(arg_types, result_types, func.getContext())); +} + +void HandleFuncOp(Operation* op) { + auto func = llvm::cast(op); + UpdateFuncType(func); +} + +// Handles cast op between the first convolution and the block argument. +LogicalResult HandleCast(TF::CastOp cast_op, ArrayRef new_shape) { + auto cast_input = cast_op.x(); + // Update input type. + auto transform_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(cast_input)); + cast_input.setType(transform_result_type); + auto block_arg = cast_input.dyn_cast(); + auto cast_op_input = dyn_cast_or_null(cast_input.getDefiningOp()); + while (block_arg || cast_op_input) { + if (block_arg) { + // Change on device function type/shape. + HandleFuncOp(block_arg.getOwner()->getParentOp()); + block_arg = nullptr; + cast_op_input = nullptr; + } else { + auto cast_input = cast_op_input.x(); + // Update input type. + auto transform_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(cast_input)); + cast_input.setType(transform_result_type); + // Update block arg and cast_op_input. + block_arg = cast_input.dyn_cast(); + cast_op_input = dyn_cast_or_null(cast_input.getDefiningOp()); + } + } + return success(); +} + +// Handles padding before convolution for space to depth transform. LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { auto ranked_type = op.input().getType().dyn_cast(); if (!ranked_type) return failure(); @@ -134,6 +181,10 @@ LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { pad_input_shape[0], pad_input_shape[1] / block_size, pad_input_shape[2] / block_size, pad_input_shape[3] * block_size * block_size}; + // Input of the pad op could be a cast op. + if (auto cast_op = dyn_cast_or_null(input.getDefiningOp())) + if (failed(HandleCast(cast_op, transform_shape))) return failure(); + auto transform_result_type = RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); input.setType(transform_result_type); @@ -141,7 +192,7 @@ LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { return success(); } -// Handle stride for the first convolution for the transform. +// Handles stride for the first convolution for the transform. void HandleConv2DStride(TF::Conv2DOp conv2d) { MLIRContext* context = conv2d.getContext(); SmallVector values = {1, 1, 1, 1}; @@ -153,7 +204,7 @@ void HandleConv2DStride(TF::Conv2DOp conv2d) { conv2d.setAttr("strides", strides); } -// Transform input shape for the first convolution. +// Transforms input shape for the first convolution. void HandleConv2DInput(TF::Conv2DOp conv2d, int64_t block_size) { auto input = conv2d.input(); auto input_shape = input.getType().cast().getShape(); @@ -165,7 +216,7 @@ void HandleConv2DInput(TF::Conv2DOp conv2d, int64_t block_size) { input.setType(transform_result_type); } -// Add padding for convolution filter for space to depth transform. +// Adds padding for convolution filter for space to depth transform. TF::PadOp GetPadOpForConv2DFilter(ArrayRef filter_shape, Value filter, OpBuilder* builder, int32_t pad_h, int32_t pad_w) { @@ -185,7 +236,7 @@ TF::PadOp GetPadOpForConv2DFilter(ArrayRef filter_shape, Value filter, paddings_value); } -// Create reshape op for space to depth transform. 
+// Creates reshape op for space to depth transform. TF::ReshapeOp GetReshapeOpForConv2DFilter(ArrayRef new_shape, Value input, OpBuilder* builder) { auto reshape_result_type = @@ -199,7 +250,7 @@ TF::ReshapeOp GetReshapeOpForConv2DFilter(ArrayRef new_shape, input, reshape_value); } -// Create transpose op for shape to depth transform. +// Creates transpose op for shape to depth transform. TF::TransposeOp GetTransposeOpForConv2DFilter(OpBuilder* builder, Value input) { SmallVector permutation = {0, 2, 1, 3, 4, 5}; auto permute_type = RankedTensorType::get({6}, builder->getIntegerType(32)); @@ -259,7 +310,7 @@ void HandleConv2DFilter(TF::Conv2DOp conv2d, int64_t block_size) { conv2d.setOperand(1, final_reshape_op); } -// Create slice op for filter in back prop pass. +// Creates slice op for filter in back prop pass. TF::SliceOp GetSliceOpForConv2DBackPropFilter( ArrayRef old_filter_shape, Value input, OpBuilder* builder) { SmallVector slice_size(old_filter_shape.begin(), @@ -281,7 +332,7 @@ TF::SliceOp GetSliceOpForConv2DBackPropFilter( start_position, slice_size_op); } -// Transform Conv2DBackPropFilter for space to depth. +// Transforms Conv2DBackPropFilter for space to depth. void HandleConv2DBackPropFilter(TF::Conv2DBackpropFilterOp backprop, ArrayRef old_filter_shape, ArrayRef new_filter_shape, @@ -354,22 +405,6 @@ void HandleConv2DBackPropFilter(TF::Conv2DBackpropFilterOp backprop, backprop.replaceAllUsesWith(slice_op.getResult()); } -// Update func arugument type to have the updated input shape. -void UpdateFuncType(FuncOp func) { - llvm::SmallVector arg_types; - arg_types.reserve(func.getNumArguments()); - for (auto arg : func.getArguments()) arg_types.emplace_back(arg.getType()); - auto terminator = func.front().getTerminator(); - SmallVector result_types(terminator->operand_type_begin(), - terminator->operand_type_end()); - func.setType(FunctionType::get(arg_types, result_types, func.getContext())); -} - -void HandleFuncOp(Operation* op) { - auto func = llvm::cast(op); - UpdateFuncType(func); -} - // Checks if the input producer op is supported in this transform. Right now, we // only check if it is a host tf.IteratorGetNext. bool IsSupportedHostInputOp(Operation* op) { @@ -397,9 +432,8 @@ TF::SpaceToDepthOp BuildSpaceToDepth(tf_device::ClusterFuncOp cluster_func, input_shape[3] * block_size * block_size}; auto transform_result_type = RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); - return builder.create(cluster_func.getLoc(), - transform_result_type, input, - APInt(64, block_size)); + return builder.create( + cluster_func.getLoc(), transform_result_type, input, block_size); } // Performs transformation for a non-replicated input. @@ -417,12 +451,13 @@ TF::SpaceToDepthOp HandleHostInput(Value input, int64_t index, // supported case (thus transform happened). bool HandleHostReplicatedInputs(int64_t index, tf_device::ClusterFuncOp cluster_func, - int64_t replicate_arg_index, + BlockArgument block_arg, tf_device::ReplicateOp replicate, int32_t block_size) { + int64_t replicate_arg_index = block_arg.getArgNumber(); // We need to know the devices to copy to. if (!replicate.devices()) return false; - int64_t num_replicas = replicate.n().getZExtValue(); + int64_t num_replicas = replicate.n(); // Gets inputs at replicate_arg_index for each replica. 
auto inputs = replicate.getOperands() .drop_front(replicate_arg_index * num_replicas) @@ -439,6 +474,7 @@ bool HandleHostReplicatedInputs(int64_t index, BuildSpaceToDepth(cluster_func, entry.value(), block_size, input_shape); replicate.setOperand(num_replicas * replicate_arg_index + entry.index(), space_to_depth); + block_arg.setType(space_to_depth.getType()); } return true; } @@ -457,9 +493,8 @@ void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size, // For a block argument, consider transforms only when it is a replicated // input (defining ops will be outside the replicate node). if (maybe_replicate == block_arg.getParentRegion()->getParentOp()) { - HandleHostReplicatedInputs(input.index(), cluster_func, - block_arg.getArgNumber(), maybe_replicate, - block_size); + HandleHostReplicatedInputs(input.index(), cluster_func, block_arg, + maybe_replicate, block_size); } } else { // For an op output, consider transforms only when 1) there is no @@ -482,7 +517,7 @@ void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size, } } -// Check if input shape of convolution is good for space to depth transform. +// Checks if input shape of convolution is good for space to depth transform. bool Conv2DInputShapeCanTransform(Value input) { auto ranked_type = input.getType().dyn_cast(); if (!ranked_type) return false; @@ -495,35 +530,59 @@ bool Conv2DInputShapeCanTransform(Value input) { return true; } -// Checks if a convoluton can apply SpaceToDepth transform. -// Only the first convolution in the graph whose batch size smaller than 8 -// and its input feature size smaller than 8 can be transformed. -Optional> GetConv2DInputArgNum(TF::Conv2DOp conv2d) { - if (conv2d.data_format() != "NHWC" || conv2d.strides().size() != 4) { - return None; - } - auto conv2d_input = conv2d.input(); - if (auto block_arg = conv2d_input.dyn_cast()) { - if (!Conv2DInputShapeCanTransform(conv2d_input)) return None; - int num_users = +// Get block argument id and number of users for the input arg. +Optional GetBlockArgNum(Value arg) { + if (auto block_arg = arg.dyn_cast()) { + if (!Conv2DInputShapeCanTransform(arg)) return None; + unsigned num_users = std::distance(block_arg.getUsers().begin(), block_arg.getUsers().end()); - return std::make_pair(block_arg.getArgNumber(), num_users); + BlockArgumentInfo block_arg_info = {block_arg.getArgNumber(), num_users}; + return block_arg_info; } + return None; +} - if (auto pad_op = llvm::dyn_cast(conv2d_input.getDefiningOp())) { - auto pad_input = pad_op.input(); - if (auto block_arg = pad_input.dyn_cast()) { - if (!Conv2DInputShapeCanTransform(pad_input)) return None; - int num_users = std::distance(block_arg.getUsers().begin(), - block_arg.getUsers().end()); - return std::make_pair(block_arg.getArgNumber(), num_users); +// Gets input block argument id and number of users for the input recursively. +// Current supported ops between convolution input and the block arguments are +// PadOp and CastOp. 
+Optional GetInputBlockArgNum(Value input) { + auto block_arg_num = GetBlockArgNum(input); + if (block_arg_num.hasValue()) return block_arg_num; + + Value next_input = input; + auto pad_op = dyn_cast_or_null(next_input.getDefiningOp()); + auto cast_op = dyn_cast_or_null(next_input.getDefiningOp()); + + while (pad_op || cast_op) { + if (pad_op) { + auto block_arg_num = GetBlockArgNum(pad_op.input()); + if (block_arg_num.hasValue()) return block_arg_num; + next_input = pad_op.input(); + } else { + auto block_arg_num = GetBlockArgNum(cast_op.x()); + if (block_arg_num.hasValue()) return block_arg_num; + next_input = cast_op.x(); } + pad_op = dyn_cast_or_null(next_input.getDefiningOp()); + cast_op = dyn_cast_or_null(next_input.getDefiningOp()); } return None; } -// Apply space to depth transform for the first convolution on TPU device. +// Checks if a convoluton can apply SpaceToDepth transform. +// Only the first convolution in the graph whose batch size smaller than 8 +// and its input feature size smaller than 8 can be transformed. +Optional GetConv2DInputArgNum(TF::Conv2DOp conv2d) { + if (conv2d.data_format() != "NHWC" || conv2d.strides().size() != 4) { + return None; + } + // Current supported ops between convolution input and the block arguments are + // PadOp and CastOp. + return GetInputBlockArgNum(conv2d.input()); +} + +// Applies space to depth transform for the first convolution on TPU device. void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) { // Check if input and filter type are RankedTensorType. auto input_tensor_type = @@ -563,8 +622,9 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) { SmallVector new_filter_shape(filter_shape.begin(), filter_shape.end()); - // Rewrite Conv2DBackPropFilter after the first convolution. - for (Operation* user : conv2d.getOperation()->getUsers()) { + // Rewrite Conv2DBackPropFilter that is the user of first convolution's input. + if (!conv2d_input.getDefiningOp()) return; + for (Operation* user : conv2d_input.getDefiningOp()->getUsers()) { if (auto backprop = dyn_cast(user)) { HandleConv2DBackPropFilter(backprop, old_filter_shape, new_filter_shape, block_size); @@ -572,7 +632,7 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) { } } -// Get block size that is equal to stride from spatial dimension +// Gets block size that is equal to stride from spatial dimension // from convolution. // Space to depth transform won't be triggered if block size <= 1. int32_t GetConv2DBlockSize(TF::Conv2DOp conv2d) { @@ -608,7 +668,6 @@ void TPUSpaceToDepthPass::runOnOperation() { if (!device_func) return; TF::Conv2DOp first_conv; - Optional> input_shape; // A map maps block argument id to the convolutions consumes them. llvm::SmallDenseMap> argnum_and_convolutions; @@ -617,13 +676,13 @@ void TPUSpaceToDepthPass::runOnOperation() { // Find out the qualified convolutions and its block argument ids. auto conv2d_result = device_func.walk([&](TF::Conv2DOp conv2d) { - Optional> arg_num_and_num_users = + Optional arg_num_and_num_users = GetConv2DInputArgNum(conv2d); if (arg_num_and_num_users.hasValue()) { // Get block size for the first convolution. 
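The transformed shapes used throughout this pass follow the standard NHWC space-to-depth rule: spatial dimensions shrink by the block size while the channel dimension grows by its square. As a small reference sketch (SpaceToDepthShape is an illustrative helper, not part of the patch):

#include <cassert>
#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// Computes the NHWC result shape of a SpaceToDepth transform:
// [b, h, w, c] -> [b, h / bs, w / bs, c * bs * bs].
llvm::SmallVector<int64_t, 4> SpaceToDepthShape(llvm::ArrayRef<int64_t> shape,
                                                int64_t block_size) {
  assert(shape.size() == 4 && block_size > 1 && "expected NHWC and block > 1");
  return {shape[0], shape[1] / block_size, shape[2] / block_size,
          shape[3] * block_size * block_size};
}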
int64_t block_size = GetConv2DBlockSize(conv2d); - auto arg_num = arg_num_and_num_users.getValue().first; - auto num_users = arg_num_and_num_users.getValue().second; + auto arg_num = arg_num_and_num_users.getValue().arg_num; + auto num_users = arg_num_and_num_users.getValue().num_users; argnum_and_convolutions[arg_num].emplace_back(conv2d, block_size); argnum_num_users[arg_num] = num_users; return WalkResult::interrupt(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 3262b83fc94..0e4ef76a54c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -174,7 +174,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( assert(metadata_str && "Missing compilation metadata"); tensorflow::tpu::TPUCompileMetadataProto metadata; metadata.ParseFromString(std::string(metadata_str.getValue())); - int64_t num_replicas = replicate.n().getLimitedValue(); + int64_t num_replicas = replicate.n(); // Find the formattable operands of `execute`, which must be mirrored // variables (arguments of `replicate`), and must be pass-throughs from while // operands. @@ -264,7 +264,7 @@ tf_device::ReplicateOp AddInputsToReplicateOp( tf_device::ReplicateOp replicate, ArrayRef new_inputs, const llvm::SmallDenseMap>& devices) { - int64_t num_replicas = replicate.n().getLimitedValue(); + int64_t num_replicas = replicate.n(); assert(new_inputs.size() == num_replicas); // As model parallelism is not yet supported, we assume that all ops are @@ -423,7 +423,7 @@ void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op, // Performs the transformation for a replicate op inside a while loop. void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, MLIRContext* context) { - int64_t num_replicas = replicate.n().getLimitedValue(); + int64_t num_replicas = replicate.n(); if (num_replicas == 1) return; tf_device::LaunchOp execute_launch; for (auto execute_launch_op : @@ -452,8 +452,8 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, !llvm::isa(compile_launch.GetBody().front())) return; - FuncOp body = while_op.body_func(); - FuncOp cond = while_op.cond_func(); + FuncOp body = while_op.body_function(); + FuncOp cond = while_op.cond_function(); // Analyze the formattable inputs. auto execute_arg_to_outer_args = @@ -537,9 +537,10 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, // Build a constant default key to specify that the unformatting should // transform the variables to the original format. 
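In the tpu_variable_runtime_reformatting.cc hunk that follows, the default state key becomes a 3-element DT_STRING tensor. Building such a constant from a tensorflow::Tensor and converting it to an MLIR attribute follows the pattern already used in that pass; a hedged sketch, where BuildDefaultStateKey and the default_sharding parameter are illustrative stand-ins:

#include <string>

#include "mlir/IR/Attributes.h"  // from @llvm-project
#include "mlir/IR/Builders.h"    // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
#include "tensorflow/core/framework/tensor.h"

// Builds a 3-element DT_STRING tensor filled with the default sharding value
// and converts it to an ElementsAttr usable as a tf.Const value.
mlir::ElementsAttr BuildDefaultStateKey(const std::string& default_sharding,
                                        mlir::Builder* builder) {
  tensorflow::Tensor key(tensorflow::DT_STRING, {3});
  for (int i = 0; i < 3; ++i)
    key.vec<tensorflow::tstring>()(i) = default_sharding;
  return tensorflow::ConvertTensor(key, builder).ValueOrDie();
}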
builder.setInsertionPointAfter(while_op); - tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {2}); + tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {3}); default_key_tensor.vec()(0) = kDefaultShardingValue; default_key_tensor.vec()(1) = kDefaultShardingValue; + default_key_tensor.vec()(2) = kDefaultShardingValue; auto default_state_key = builder.create( while_op.getLoc(), tensorflow::ConvertTensor(default_key_tensor, &builder).ValueOrDie()); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index 0a69987deb0..b65f07c39ac 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -43,6 +43,10 @@ namespace { class BreakUpIslands : public TF::PerFunctionAggregateAnalysisConsumerPass< BreakUpIslands, TF::SideEffectAnalysis> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnFunction(FuncOp func, const TF::SideEffectAnalysis::Info& side_effect_analysis); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 571d5e3e715..0445dbb698a 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -49,6 +49,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/utils/name_utils.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -80,46 +81,14 @@ constexpr char kInvalidExecutorGraphMsg[] = constexpr char kDeviceAttr[] = "tf.device"; constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; -bool IsLegalChar(char c, bool first_char) { - if (isalpha(c)) return true; - if (isdigit(c)) return true; - if (c == '.') return true; - if (c == '_') return true; - - // First character of a node name can only be a letter, digit, dot or - // underscore. - if (first_char) return false; - - if (c == '/') return true; - if (c == '-') return true; - - return false; -} - -// Convert characters in name that are considered illegal in TensorFlow Node -// name to '.'. -std::string LegalizeNodeName(llvm::StringRef name) { - assert(!name.empty() && "expected non-empty name"); - - std::string legalized_name; - bool first = true; - for (auto c : name) { - if (IsLegalChar(c, first)) { - legalized_name += c; - } else { - legalized_name += '.'; - } - first = false; - } - - return legalized_name; -} - // OpOrArgLocNameMapper that legalizes the returned name. 
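// Illustrative sketch (not part of this patch): node-name legalization now
// comes from tensorflow/compiler/mlir/utils/name_utils.h (included above) and
// mutates the string in place. Based on the deleted IsLegalChar/LegalizeNodeName
// pair, the rule is roughly the following; this standalone reimplementation is
// for illustration only.
#include <cctype>
#include <string>

void ExampleLegalizeNodeName(std::string& name) {
  for (size_t i = 0; i < name.size(); ++i) {
    const char c = name[i];
    bool legal =
        std::isalnum(static_cast<unsigned char>(c)) || c == '.' || c == '_';
    // '/' and '-' are legal anywhere except as the first character.
    if (i > 0 && (c == '/' || c == '-')) legal = true;
    if (!legal) name[i] = '.';  // Illegal characters become '.'.
  }
}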
class LegalizedOpOrValLocNameMapper : public OpOrArgLocNameMapper { private: std::string GetName(OpOrVal op_or_val) override { - return LegalizeNodeName(OpOrArgLocNameMapper::GetName(op_or_val)); + std::string name = OpOrArgLocNameMapper::GetName(op_or_val); + assert(!name.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(name); + return name; } }; @@ -523,13 +492,14 @@ StatusOr> Exporter::Convert( if (index >= num_data_results) break; // TODO(jpienaar): If there is a result index specified, ensure only one // and that it matches the result index of the op. - std::string orig_name(output_names[index]); - auto tensor_id = ParseTensorName(orig_name); - auto name = LegalizeNodeName( - llvm::StringRef(tensor_id.node().data(), tensor_id.node().size())); + std::string name(output_names[index]); + auto tensor_id = ParseTensorName(name); + std::string tensor_id_node(tensor_id.node()); + assert(!tensor_id_node.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(tensor_id_node); // Ensure name does not get reused. - (void)exporter.op_to_name_.GetUniqueName(name); + (void)exporter.op_to_name_.GetUniqueName(tensor_id_node); } } @@ -537,8 +507,9 @@ StatusOr> Exporter::Convert( TF_RET_CHECK(input_names.size() == block.getNumArguments()); for (const auto& it : llvm::enumerate(function.getArguments())) { // TODO(lyandy): Update when changing feed/fetch import. - std::string orig_name(input_names[it.index()]); - std::string name = LegalizeNodeName(orig_name); + std::string name(input_names[it.index()]); + assert(!name.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(name); auto tensor_id = ParseTensorName(name); TF_RET_CHECK(tensor_id.index() == 0) << "input port designation not supported"; @@ -726,7 +697,7 @@ Status Exporter::Convert(mlir::ModuleOp module, mlir::Identifier::get("main", module.getContext()); absl::optional entry_func; FunctionDefLibrary flib; - auto tf_dialect = module.getContext()->getRegisteredDialect("tf"); + auto tf_dialect = module.getContext()->getLoadedDialect("tf"); for (auto function : module.getOps()) { if (function.isExternal()) return errors::FailedPrecondition("External functions not supported"); @@ -799,7 +770,7 @@ StatusOr> ConvertMlirToGraphdef( stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( mlir::FuncOp func, const GraphExportConfig& configs, FunctionDef* function_def) { - Dialect* tf_dialect = func.getContext()->getRegisteredDialect("tf"); + Dialect* tf_dialect = func.getContext()->getLoadedDialect("tf"); FunctionDefLibrary flib; TF_RETURN_IF_ERROR( Exporter::ConvertLibFunction(configs, tf_dialect, func, &flib)); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc index 3ca06e5efa9..727831a6055 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc @@ -25,8 +25,8 @@ limitations under the License. 
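// Illustrative sketch (not part of this patch): with MLIR moving from global
// dialect registration to per-context loading, a context only knows the
// dialects that were explicitly loaded into it. That is why the exporter above
// now queries getLoadedDialect("tf") instead of getRegisteredDialect, and why
// the importer and translation registrations later in this change load the TF
// dialects up front. A minimal loading sequence, mirroring the
// LoadImporterDialects helper introduced below, looks like this (includes are
// assumed to be sufficient):
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"

void ExampleLoadTensorFlowDialects(mlir::MLIRContext& context) {
  mlir::DialectRegistry registry;
  mlir::RegisterAllTensorFlowDialects(registry);
  registry.loadAll(&context);
}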
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -34,7 +34,6 @@ limitations under the License. namespace tensorflow { namespace { -using stream_executor::port::StatusOr; // Sets type list attribute with the given `name` to the given `types`. If the // attribute already exists with a different value, returns an error. @@ -90,7 +89,7 @@ Status SetShapeAttribute(absl::string_view name, ContainerT shapes, // definitions and isn't a header file. #include "tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator.inc" -// Collect all the unregistered attributes for an TF dialect operation. +// Collects all the unregistered attributes for an TF dialect operation. // Attributes "name" and "device" are not included because they are not part // of an TF op attributes. Status GetUnregisteredAttrs( @@ -123,17 +122,10 @@ Status GetUnregisteredAttrs( return Status::OK(); } -} // namespace - -StatusOr> ConvertTFDialectOpToNodeDef( - mlir::Operation* inst, llvm::StringRef name, - bool ignore_unregistered_attrs) { - // Use auto generated function to populate derived attribute. - // - // Note: This only populates derived attributes for TensorFlow ops that are - // generated using the TableGen. Manually defined ops should have all the - // attributes present as native MLIR op attributes. - +// Collects all attribute names to ignore in an MLIR operation when exporting to +// a TensorFlow NodeDef. +StatusOr> GetAttributesToIgnore( + mlir::Operation* inst, bool ignore_unregistered_attrs) { // The elements are owned by the MLIRContext. absl::flat_hash_set attrs_to_ignore; if (inst->isRegistered()) { @@ -162,15 +154,25 @@ StatusOr> ConvertTFDialectOpToNodeDef( attrs_to_ignore.insert(attr_name.data()); } - TF_ASSIGN_OR_RETURN(auto node_def, - GetOperationNodeDef(attrs_to_ignore, inst, name)); + return attrs_to_ignore; +} + +// Populates all derived attributes of a MLIR operation in a proto +// map. +Status PopulateDerivedAttributes(mlir::Operation* inst, + bool ignore_unregistered_attrs, + AttrValueMap* attributes) { + // Use auto generated function to populate derived attribute. + // + // Note: This only populates derived attributes for TensorFlow ops that are + // generated using the TableGen. Manually defined ops should have all the + // attributes present as native MLIR op attributes. 
// If the operation is not registered, we won't be able to infer any attribute if (inst->isRegistered()) { - TF_RETURN_WITH_CONTEXT_IF_ERROR( - PopulateDerivedAttrs(inst, node_def->mutable_attr()), - "When populating derived attrs for ", - inst->getName().getStringRef().str()); + TF_RETURN_WITH_CONTEXT_IF_ERROR(PopulateDerivedAttrs(inst, attributes), + "When populating derived attrs for ", + inst->getName().getStringRef().str()); } // Here we only add the shapes for the leading values with ShapedType, @@ -185,10 +187,38 @@ StatusOr> ConvertTFDialectOpToNodeDef( mlir::TF::ResultShapeRange output_shapes = { mlir::TF::ResultShapeIterator(begin), mlir::TF::ResultShapeIterator(end)}; - TF_RETURN_IF_ERROR(SetShapeAttribute("_output_shapes", output_shapes, - node_def->mutable_attr())); + TF_RETURN_IF_ERROR( + SetShapeAttribute("_output_shapes", output_shapes, attributes)); } } + + return Status::OK(); +} + +} // namespace + +Status GetAttrValuesFromOperation(mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs, + AttrValueMap* attributes) { + TF_ASSIGN_OR_RETURN(auto attrs_to_ignore, + GetAttributesToIgnore(inst, ignore_unregistered_attrs)); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertAttributes(inst->getAttrs(), attrs_to_ignore, attributes), + "while converting attributes for node: ", name.str()); + TF_RETURN_IF_ERROR( + PopulateDerivedAttributes(inst, ignore_unregistered_attrs, attributes)); + return Status::OK(); +} + +StatusOr> ConvertTFDialectOpToNodeDef( + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs) { + TF_ASSIGN_OR_RETURN(auto attrs_to_ignore, + GetAttributesToIgnore(inst, ignore_unregistered_attrs)); + TF_ASSIGN_OR_RETURN(auto node_def, + GetOperationNodeDef(attrs_to_ignore, inst, name)); + TF_RETURN_IF_ERROR(PopulateDerivedAttributes(inst, ignore_unregistered_attrs, + node_def->mutable_attr())); return node_def; } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h index a19ad1f2940..bd260171a86 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h @@ -18,12 +18,22 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { -// Converts an MLIR operation to TensorFlow NodeDef with given node name. This +// Extracts the attributes of a MLIR operation and populates the converted +// attributes in a proto map. +Status GetAttrValuesFromOperation(mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs, + AttrValueMap* attributes); + +// Converts a MLIR operation to TensorFlow NodeDef with given node name. This // name should be unique to the graph it is being inserted to. If the // `ignore_unregistered_attrs` argument is set to true, the attributes which are // not in the op registry will be ignored. If the `ignore_unregistered_attrs` @@ -31,9 +41,9 @@ namespace tensorflow { // ShapedType for the leading values with ShapedType in the results of the // nodes. 
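// Illustrative usage sketch (not part of this patch) of the two entry points
// declared in this header, assuming it sits in namespace tensorflow and the
// usual TF status macros are available; the operation pointer and node name
// are placeholders. ConvertTFDialectOpToNodeDef yields a full NodeDef, while
// GetAttrValuesFromOperation fills an AttrValueMap the caller already owns.
Status ExampleExport(mlir::Operation* inst) {
  TF_ASSIGN_OR_RETURN(auto node_def,
                      ConvertTFDialectOpToNodeDef(
                          inst, "example_node",
                          /*ignore_unregistered_attrs=*/false));
  AttrValueMap attrs;
  TF_RETURN_IF_ERROR(GetAttrValuesFromOperation(
      inst, "example_node", /*ignore_unregistered_attrs=*/false, &attrs));
  // node_def and attrs would now be consumed by the caller.
  return Status::OK();
}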
Set it to true if the returned NodeDef will be executed by the linked // TF Eager runtime. -stream_executor::port::StatusOr> -ConvertTFDialectOpToNodeDef(mlir::Operation* inst, llvm::StringRef name, - bool ignore_unregistered_attrs); +StatusOr> ConvertTFDialectOpToNodeDef( + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 27385e81262..153c537589c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -64,6 +64,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -141,6 +142,13 @@ bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, return false; } +void LoadImporterDialects(mlir::MLIRContext& context) { + // Load dialects involved in the conversion + mlir::DialectRegistry registry; + mlir::RegisterAllTensorFlowDialects(registry); + registry.loadAll(&context); +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -177,7 +185,8 @@ Status UpgradeLegacyGraph(Graph* graph, FunctionLibraryDefinition* flib_def, restrict_functionalization_to_tpu_nodes ? [](const Node* n) { return n->attrs().Find(kTpuReplicateAttr); } : NodeFilter{}; - return FunctionalizeControlFlow(graph, flib_def, node_filter); + return FunctionalizeControlFlow(graph, flib_def, node_filter, + /*include_functions=*/true); } // Stateful helper class to import a TensorFlow model into an MLIR Module. @@ -1934,22 +1943,18 @@ Status ImporterBase::ConvertNode(const Node& node) { } } - // Map If and StatelessIf op in TensorFlow to the common If op in MLIR and add - // the differentiating attribute. - if (node.IsIfNode()) { - result.name = mlir::OperationName(get_full_op_name("If"), context_); - mlir::BoolAttr val = builder_.getBoolAttr(node_type_name == "StatelessIf"); + auto composite_control_flow_op = [&](const std::string& name) { + result.name = mlir::OperationName(get_full_op_name(name), context_); + bool stateless = absl::StartsWith(node_type_name, "Stateless"); + mlir::BoolAttr val = builder_.getBoolAttr(stateless); result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); - } + }; - // Map While and StatelessWhile op in TensorFlow to the common While op in - // MLIR and add the differentiating attribute. - if (node.IsWhileNode()) { - result.name = mlir::OperationName(get_full_op_name("While"), context_); - mlir::BoolAttr val = - builder_.getBoolAttr(node_type_name == "StatelessWhile"); - result.attributes.push_back(builder_.getNamedAttr("is_stateless", val)); - } + // Map Case/If/While and StatelessCase/If/While op in TensorFlow to the common + // Case/If/While op in MLIR and add the differentiating attribute. 
+ if (node.IsCaseNode()) composite_control_flow_op("Case"); + if (node.IsIfNode()) composite_control_flow_op("If"); + if (node.IsWhileNode()) composite_control_flow_op("While"); // Register the mapping between the TF node and the newly created operation. node_values_[node.id()] = @@ -2139,6 +2144,7 @@ StatusOr GraphDefImporter::Convert( mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, llvm::StringRef func_name) { + LoadImporterDialects(*context); mlir::OwningModuleRef module = mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); std::unordered_map tf_name_to_mlir_name; @@ -2873,7 +2879,7 @@ void AdjustBoundInputArgTypes(mlir::ModuleOp module) { mlir::OpBuilder builder(func.getBody()); llvm::SmallVector new_input_types; for (int i = 0, e = func.getNumArguments(); i < e; i++) { - auto arg = func.front().getArgument(i); + auto arg = func.getArgument(i); auto global_tensor = mlir::tf_saved_model::LookupBoundInputOfType< mlir::tf_saved_model::GlobalTensorOp>(func, i, symbol_table); if (global_tensor) { @@ -3195,6 +3201,7 @@ Status CreateSavedModelIR( StatusOr SavedModelObjectGraphImporter::Convert( SavedModelV2Bundle* saved_model, absl::Span exported_names, mlir::MLIRContext* context, bool add_default_attributes) { + LoadImporterDialects(*context); GraphDebugInfo dummy_debug_info; const GraphDebugInfo& debug_info = saved_model->debug_info() ? *saved_model->debug_info() : dummy_debug_info; @@ -3274,6 +3281,7 @@ class SavedModelSignatureDefImporter { static StatusOr Convert( const SavedModelBundle& bundle, absl::Span exported_names, mlir::MLIRContext* context, bool upgrade_legacy) { + LoadImporterDialects(*context); SavedModelSignatureDefImporter importer(bundle, exported_names, context); TF_RETURN_IF_ERROR(importer.InitializeGraph(upgrade_legacy)); return importer.ConvertSignatures(); @@ -3646,6 +3654,8 @@ stream_executor::port::StatusOr ConvertFunctionToMlir( tensorflow::GraphDebugInfo dummy_debug_info; tensorflow::GraphImportConfig specs; specs.graph_as_function = true; + for (const auto* control_ret_node : fbody->control_ret_nodes) + specs.control_outputs.push_back(control_ret_node->name()); return GraphDefImporter::Convert(context, *fbody->graph, dummy_debug_info, flib_def, specs, name); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 1c7988d3a40..58377661a23 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -219,22 +219,18 @@ StatusOr GraphdefToSplattedMlirTranslateFunction( if (auto attr = inst.getAttrOfType(attr_id)) { mlir::Attribute rand_val; mlir::Type element_type = attr.getType().getElementType(); + if (element_type.isa()) { + rand_val = mlir::IntegerAttr::get(element_type, std::rand()); + } else if (element_type.isF16() || element_type.isF32() || + element_type.isF64()) { + rand_val = mlir::FloatAttr::get(element_type, + std::rand() * 1.0 / RAND_MAX); - switch (element_type.getKind()) { - case mlir::StandardTypes::Integer: - rand_val = mlir::IntegerAttr::get(element_type, std::rand()); - break; - case mlir::StandardTypes::F16: - case mlir::StandardTypes::F32: - case mlir::StandardTypes::F64: - rand_val = mlir::FloatAttr::get(element_type, - std::rand() * 1.0 / RAND_MAX); - break; - default: - inst.emitWarning() - << "Skipping splat conversion for " - << "an 
unsupported attribute type " << element_type; - continue; + } else { + inst.emitWarning() + << "Skipping splat conversion for " + << "an unsupported attribute type " << element_type; + continue; } auto new_attr = mlir::DenseElementsAttr::get(attr.getType(), rand_val); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index b646e14b71d..f63cb091a09 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/Support/MemoryBuffer.h" #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" @@ -86,6 +87,9 @@ static LogicalResult MlirToGraphdefTranslateFunction( } static TranslateFromMLIRRegistration mlir_to_graphdef_translate( - "mlir-to-graphdef", MlirToGraphdefTranslateFunction); + "mlir-to-graphdef", MlirToGraphdefTranslateFunction, + [](DialectRegistry& registry) { + mlir::RegisterAllTensorFlowDialects(registry); + }); } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc index 5236bdeffbf..22e6559a0f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" namespace mlir { @@ -67,6 +68,7 @@ static LogicalResult MlirToTfNodeDef(ModuleOp module, // Test only translation to convert a simple MLIR module with a single TF // dialect op to NodeDef. static TranslateFromMLIRRegistration translate_from_mlir_registration( - "test-only-mlir-to-tf-nodedef", MlirToTfNodeDef); + "test-only-mlir-to-tf-nodedef", MlirToTfNodeDef, + mlir::RegisterAllTensorFlowDialects); } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h similarity index 66% rename from tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h rename to tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h index 599a8df63d7..bd81cae5730 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project @@ -36,7 +36,18 @@ inline void CopyUnderscoredAttributes(Operation *from, Operation *to) { }); } +// Copies attributes that are either `device` or whose name begins with an _ +// from `from` to `to`. +// TODO(b/158769932): This should be a general feature instead post some policy +// discussion. +inline void CopyDeviceAndUnderscoredAttributes(Operation *from, Operation *to) { + auto device = mlir::Identifier::get("device", from->getContext()); + CopyAttributes(from, to, [&device](const NamedAttribute &attr) { + return attr.first.strref().front() == '_' || attr.first == device; + }); +} + } // namespace TF } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index f06fe1280f0..bf894a6c551 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "absl/types/optional.h" +#include "absl/types/variant.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -31,12 +32,11 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -49,12 +49,14 @@ limitations under the License. 
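// Illustrative usage sketch (not part of this patch) of the
// CopyDeviceAndUnderscoredAttributes helper added above: a rewrite that
// replaces one op with another can forward the "device" attribute and all
// "_"-prefixed attributes (e.g. _tpu_replicate) in a single call. The op
// pointers are placeholders.
#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"

void ExampleForwardAttributes(mlir::Operation* from, mlir::Operation* to) {
  mlir::TF::CopyDeviceAndUnderscoredAttributes(from, to);
}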
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/logging.h" @@ -62,34 +64,19 @@ limitations under the License. namespace tensorflow { namespace { -// Parses the MLIR module from the mlir_module_string. -Status ParseMlirModule(llvm::StringRef mlir_module_string, - mlir::MLIRContext* mlir_context, - mlir::OwningModuleRef* mlir_module) { - TF_RET_CHECK(!mlir_module_string.empty()) - << "unexpected empty serialized MLIR module string"; - TF_RET_CHECK(mlir_module) << "unexpected null MLIR module pointer"; - - // Make sure we catch any error reported by MLIR and forward it to the TF - // error reporting system. - mlir::StatusScopedDiagnosticHandler error_handler(mlir_context); - - // Parse the module. - *mlir_module = mlir::parseSourceString(mlir_module_string, mlir_context); - if (!*mlir_module) { - return error_handler.Combine( - errors::InvalidArgument("could not parse MLIR module")); +// Extracts shape from XlaArgument as TensorShape. If shape is a xla::Shape, +// that is converted to a TensorShape. +StatusOr GetTensorShapeFromXlaArgument(const XlaArgument& arg) { + if (absl::holds_alternative(arg.shape)) { + TensorShape arg_shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(absl::get(arg.shape), &arg_shape)); + return arg_shape; + } else { + return absl::get(arg.shape); } - - return Status::OK(); } -// Arguments to a computation can be either a tensor or resource. -struct TensorOrResourceShape { - TensorShape shape; - bool is_resource = false; -}; - // Converts arg_shapes to xla::Shape's and store into xla_input_shapes. 
Status GetXlaInputShapes( mlir::ModuleOp module, llvm::ArrayRef arg_shapes, @@ -276,69 +263,69 @@ Status RefineShapes(llvm::ArrayRef arg_shapes, return Status::OK(); } -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; +static void RegisterDialects(mlir::DialectRegistry& registry) { + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); } } // namespace +void CreateConvertMlirToXlaHloPipeline( + mlir::OpPassManager& pm, llvm::StringRef device_type, + llvm::MutableArrayRef> + custom_legalization_passes) { + pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); + pm.addPass(mlir::TF::CreateStackOpsDecompositionPass()); + pm.addPass(mlir::TF::CreateTensorArrayOpsDecompositionPass()); + pm.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); + pm.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); + pm.addPass(mlir::createSymbolDCEPass()); + // Guarantee all functions have one use, which enables shape inference. + pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // LegalizeTFControlFlow encapsulates arguments for control flow operations + // with a tuple argument which break the assumption of resource lifting + // inside PromoteResourcesToArgs. + pm.addPass(mlir::mhlo::createLegalizeTFControlFlowPass()); + + pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/true, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); + for (auto& target_pass : custom_legalization_passes) { + pm.addNestedPass(std::move(target_pass)); + } + pm.addPass(mlir::mhlo::CreateLegalizeTFCommunicationPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + // Run shape inference pass to propagate shapes through tensor_cast operations + // from static to dynamic shapes. This could be generated if the shape + // inference was originally missing in a TF op but the corresponding HLO op + // had static shape after lowering. + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Run LegalizeTFPass again because the previous legalization passes can + // expose more graph pruning and canonicalization opportunities that are + // necessary for the second LegalizeTFPass(allow_partial_conversion=false) + // invocation. + pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/false, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. 
+ pm.addNestedPass( + mlir::mhlo::createSinkConstantsToControlFlowPass()); +} + Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); - tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass()); - tf2xla.addPass(mlir::TF::CreateTensorArrayOpsDecompositionPass()); - tf2xla.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); - tf2xla.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); - tf2xla.addPass(mlir::createSymbolDCEPass()); - // Guarantee all functions have one use, which enables shape inference. - tf2xla.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - // LegalizeTFControlFlow encapsulates arguments for control flow operations - // with a tuple argument which break the assumption of resource lifting - // inside PromoteResourcesToArgs. - tf2xla.addPass(mlir::mhlo::createLegalizeTFControlFlowPass()); - - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass(true)); - for (auto& target_pass : custom_legalization_passes) { - tf2xla.addNestedPass(std::move(target_pass)); - } - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - - // Leverage tf2xla kernels for ops that didn't get lowered in the previous - // legalization pass. - tf2xla.addPass(mlir::mhlo::createLegalizeTfWithTf2XlaPass(device_type)); - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - - // Run shape inference pass to propagate shapes through tensor_cast operations - // from static to dynamic shapes. This could be generated if the shape - // inference was originally missing in a TF op but the corresponding HLO op - // had static shape after lowering. - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - - // Run LegalizeTFPass again because the previous legalization passes can - // expose more graph pruning and canonicalization opportunities that are - // necessary for the second LegalizeTFPass(allow_partial_conversion=false) - // invocation. - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass(false)); - // In order to export to XLA, we must sink constants to control flow regions, - // since XLA uses functional control flow. 
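// Illustrative usage sketch (not part of this patch): callers of the new
// CreateConvertMlirToXlaHloPipeline helper populate a PassManager with it and
// run the result over the module before emitting HLO, which is also what the
// "tf-to-hlo-pipeline" registration later in this change does. The module
// value and device type below are placeholders.
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h"

mlir::LogicalResult ExampleLowerToHlo(mlir::ModuleOp module_op) {
  mlir::PassManager pm(module_op.getContext());
  tensorflow::CreateConvertMlirToXlaHloPipeline(
      pm, /*device_type=*/"XLA_CPU_JIT",
      /*custom_legalization_passes=*/{});
  return pm.run(module_op);
}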
- tf2xla.addNestedPass( - mlir::mhlo::createSinkConstantsToControlFlowPass()); + CreateConvertMlirToXlaHloPipeline(tf2xla, device_type, + custom_legalization_passes); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling @@ -369,12 +356,13 @@ Status ConvertMLIRToXlaComputation( return Status::OK(); } -static Status CompileMlirToXlaHlo( +Status CompileMlirToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, - llvm::StringRef device_type, bool use_tuple_args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("mlir_compile_before", module_op); @@ -391,9 +379,8 @@ static Status CompileMlirToXlaHlo( compilation_result->computation = std::make_shared(); TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( module_op, device_type, compilation_result->computation.get(), - use_tuple_args, - /*return_tuple=*/true, shape_representation_fn, - std::move(custom_legalization_passes))); + use_tuple_args, use_return_tuple, shape_representation_fn, + custom_legalization_passes)); // Construct mapping from XlaComputation's arg to input edges of execute // node. @@ -420,21 +407,22 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef device_type, bool use_tuple_args, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { - RegisterDialects(); + llvm::MutableArrayRef> + custom_legalization_passes) { mlir::MLIRContext mlir_context; + RegisterDialects(mlir_context.getDialectRegistry()); mlir::OwningModuleRef mlir_module; TF_RETURN_IF_ERROR( - ParseMlirModule(mlir_module_string, &mlir_context, &mlir_module)); + DeserializeMlirModule(mlir_module_string, &mlir_context, &mlir_module)); llvm::SmallVector tensor_or_resource_shapes; tensor_or_resource_shapes.reserve(arg_shapes.size()); for (const auto& arg_shape : arg_shapes) tensor_or_resource_shapes.push_back({arg_shape}); return CompileMlirToXlaHlo(mlir_module.get(), tensor_or_resource_shapes, device_type, use_tuple_args, - shape_representation_fn, compilation_result, - std::move(custom_legalization_passes)); + /*use_return_tuple=*/true, shape_representation_fn, + compilation_result, custom_legalization_passes); } // Rewrites the given module with specified args. For each of the constant args, @@ -442,8 +430,8 @@ Status CompileSerializedMlirToXlaHlo( // removed from the signature. For resource args, their subtypes are populated. // Returns the original indices for the other arguments on success. 
static StatusOr> RewriteWithArgs( - mlir::ModuleOp module, llvm::ArrayRef args) { - mlir::FuncOp main_fn = module.lookupSymbol("main"); + mlir::ModuleOp module_op, llvm::ArrayRef args) { + mlir::FuncOp main_fn = module_op.lookupSymbol("main"); std::vector params; bool has_resource_args = false; @@ -455,7 +443,9 @@ static StatusOr> RewriteWithArgs( if (xla_arg.kind == XlaArgument::kResource) { mlir::Type element_type; TF_RETURN_IF_ERROR(ConvertDataType(xla_arg.type, builder, &element_type)); - auto resource_shape = absl::get(xla_arg.shape).dim_sizes(); + TF_ASSIGN_OR_RETURN(TensorShape arg_shape, + GetTensorShapeFromXlaArgument(xla_arg)); + auto resource_shape = arg_shape.dim_sizes(); llvm::SmallVector resource_subtype_shape( resource_shape.begin(), resource_shape.end()); auto resource_subtype = @@ -481,7 +471,7 @@ static StatusOr> RewriteWithArgs( ConvertTensor(xla_arg.constant_value, &builder)); // TODO(hinsu): Use the actual location of the constant. auto constant = builder.create( - mlir::UnknownLoc::get(module.getContext()), value_attr); + mlir::UnknownLoc::get(module_op.getContext()), value_attr); mlir_arg.replaceAllUsesWith(constant); args_to_erase.push_back(idx); } @@ -503,45 +493,66 @@ static StatusOr> RewriteWithArgs( } Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, - llvm::StringRef device_type, bool use_tuple_args, - const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, + mlir::ModuleOp module_op, llvm::ArrayRef args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { - RegisterDialects(); - - mlir::MLIRContext context; - GraphImportConfig config; - config.graph_as_function = true; - auto module_or = - ConvertGraphToMlir(graph, debug_info, flib_def, config, &context); - if (!module_or.ok()) return module_or.status(); - - mlir::ModuleOp module = module_or.ValueOrDie().get(); + llvm::MutableArrayRef> + custom_legalization_passes) { TF_ASSIGN_OR_RETURN(std::vector remaining_params, - RewriteWithArgs(module, {args.data(), args.size()})); + RewriteWithArgs(module_op, args)); llvm::SmallVector arg_shapes; arg_shapes.reserve(remaining_params.size()); for (unsigned idx : remaining_params) { const auto& arg = args[idx]; - arg_shapes.push_back({absl::get(arg.shape), + TF_ASSIGN_OR_RETURN(TensorShape arg_shape, + GetTensorShapeFromXlaArgument(arg)); + arg_shapes.push_back({arg_shape, /*is_resource=*/arg.kind == XlaArgument::kResource}); } - mlir::PassManager pm(&context); + mlir::PassManager pm(module_op.getContext()); mlir::TF::StandardPipelineOptions tf_options; mlir::TF::CreateTFStandardPipeline(pm, tf_options); { - mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); - if (failed(pm.run(module))) return diag_handler.ConsumeStatus(); + mlir::StatusScopedDiagnosticHandler diag_handler(module_op.getContext()); + if (failed(pm.run(module_op))) return diag_handler.ConsumeStatus(); } auto status = CompileMlirToXlaHlo( - module, arg_shapes, device_type, use_tuple_args, shape_representation_fn, - compilation_result, std::move(custom_legalization_passes)); + module_op, arg_shapes, device_type, use_tuple_args, use_return_tuple, + shape_representation_fn, compilation_result, custom_legalization_passes); compilation_result->input_mapping = remaining_params; return status; } +Status CompileGraphToXlaHlo( + const Graph& graph, llvm::ArrayRef args, + 
llvm::StringRef device_type, bool use_tuple_args, + const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes) { + mlir::MLIRContext context; + RegisterDialects(context.getDialectRegistry()); + GraphImportConfig config; + config.graph_as_function = true; + // Disable shape inference during import as some TensorFlow op fails during + // shape inference with dynamic shaped operands. This in turn causes the + // import to fail. Shape inference during import is going to be removed and + // the shape inference pass is run early in the pass pipeline, shape inference + // during import is not necessary. + config.enable_shape_inference = false; + auto module_or = + ConvertGraphToMlir(graph, debug_info, flib_def, config, &context); + if (!module_or.ok()) return module_or.status(); + + mlir::ModuleOp module_op = module_or.ValueOrDie().get(); + return CompileGraphToXlaHlo(module_op, args, device_type, use_tuple_args, + /*use_return_tuple=*/true, + shape_representation_fn, compilation_result, + custom_legalization_passes); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index 5c64a65ecbd..dac1c994d03 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -16,10 +16,13 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ +#include + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/tf2xla/xla_argument.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -30,6 +33,14 @@ limitations under the License. namespace tensorflow { +// Populates the supplied passmanager with the passes required to run the +// TF MLIR to XLA HLO MLIR conversion/legalization. Custom legalization passes +// can be populated in `custom_legalization_passes`. +void CreateConvertMlirToXlaHloPipeline( + mlir::OpPassManager& pm, llvm::StringRef device_type, + llvm::MutableArrayRef> + custom_legalization_passes); + // Lowers MLIR module to XLA HLO inside an XlaComputation. The input module // should only contain operations in tf dialect. If the input module contains // operation in the tf_executor dialect, for example, returns an error. @@ -61,7 +72,24 @@ Status ConvertMLIRToXlaComputation( xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn = nullptr, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes = {}); + +// Helper struct representing argument tensor or resource handle shapes. +struct TensorOrResourceShape { + TensorShape shape; + bool is_resource = false; +}; + +// Compiles a MLIR module into XLA HLO, generates all accompanying metadata and +// stores them in CompilationResult. 
+Status CompileMlirToXlaHlo( + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes); // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. @@ -70,17 +98,33 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef device_type, bool use_tuple_args, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes = {}); -// Same as the above but takes input as TensorFlow Graph. +// Compiles a TensorFlow Graph (already converted to MLIR, imported with +// tf_executor dialect still present) into XLA HLO, generates all accompanying +// metadata and stores them in CompilationResult. This will rewrite arguments +// and run the TensorFlow standard pipeline prior to invoking +// `CompileMlirToXlaHlo`. +Status CompileGraphToXlaHlo( + mlir::ModuleOp module_op, llvm::ArrayRef args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes); + +// Compiles a TensorFlow Graph into XLA HLO, generates all accompanying metadata +// and stores them in CompilationResult. // TODO(lyandy): Allow populating of targets/control outputs. Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, + const Graph& graph, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes = {}); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc new file mode 100644 index 00000000000..57267ff027f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" + +namespace { +void CreateConvertMlirToXlaHloPipelineWithDefaults(mlir::OpPassManager& pm) { + tensorflow::CreateConvertMlirToXlaHloPipeline( + pm, /*device_type=*/"XLA_CPU_JIT", + /*custom_legalization_passes=*/{}); +} + +mlir::PassPipelineRegistration<> pipeline( + "tf-to-hlo-pipeline", + "Convert TF dialect to HLO dialect (used for compilation in bridge).", + CreateConvertMlirToXlaHloPipelineWithDefaults); +} // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc deleted file mode 100644 index 6ebf6897bb1..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ /dev/null @@ -1,542 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" - -#include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/function_ops.h" -#include "tensorflow/cc/ops/resource_variable_ops.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/graph/testlib.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/stream_executor/lib/statusor.h" - -namespace tensorflow { -namespace { - -// A dummy shape representation function that simply converts given shape into -// an xla::Shape without assigning any layouts. 
-xla::StatusOr TestShapeRepresentation(const TensorShape& shape, - DataType type, - bool use_fast_memory) { - xla::Shape xla_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); - return xla_shape; -} - -TEST(CompileSerializedMlirToXlaHloTest, InvalidSerializedMlirModule) { - constexpr char invalid_mlir_module[] = - "totally @invalid MLIR module {here} <-"; - std::vector arg_shapes; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - invalid_mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - EXPECT_EQ(s.code(), tensorflow::errors::Code::INVALID_ARGUMENT); - EXPECT_EQ(s.ToString(), - "Invalid argument: could not parse MLIR module-:1:1: error: " - "custom op 'totally' is unknown\n"); -} - -constexpr llvm::StringRef kBinaryAddModule = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.AddV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor, tensor) -> tensor - return %0 : tensor - } - } -)"; - -TEST(CompileSerializedMlirToXlaHloTest, TupleArgs) { - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - kBinaryAddModule, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[], f32[])) -> (f32[]) { - %arg_tuple.1 = (f32[], f32[]) parameter(0) - %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=1 - %add.4 = f32[] add(f32[] %get-tuple-element.2, f32[] %get-tuple-element.3) - ROOT %tuple.5 = (f32[]) tuple(f32[] %add.4) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); - - // Expect an in order input mapping. - EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); - - // Expect a single tuple-shape, containing two F32 scalars. - EXPECT_EQ(compilation_result.xla_input_shapes.size(), 1); - xla::Shape expected_input_shape = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), - xla::ShapeUtil::MakeShape(xla::F32, {})}); - EXPECT_EQ(compilation_result.xla_input_shapes.front(), expected_input_shape); - - // Expect output shape is a tuple shape containing a single F32 Scalar type. - const xla::Shape output_shape = - xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); - const xla::Shape tuple_output_shape = - xla::ShapeUtil::MakeTupleShape({output_shape}); - EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); - - // Expect exactly 1 OutputDescription. - EXPECT_EQ(compilation_result.outputs.size(), 1); - const XlaCompiler::OutputDescription& output_desc = - compilation_result.outputs.front(); - EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); - EXPECT_EQ(output_desc.shape, TensorShape()); - EXPECT_FALSE(output_desc.is_constant); - EXPECT_FALSE(output_desc.is_tensor_list); - - // Expect no resource updates from computation. 
- EXPECT_TRUE(compilation_result.resource_updates.empty()); -} - -TEST(CompileSerializedMlirToXlaHloTest, IndividualArgs) { - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - kBinaryAddModule, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/false, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.5 - -ENTRY %main.5 (Arg_0.1: f32[], Arg_1.2: f32[]) -> (f32[]) { - %Arg_0.1 = f32[] parameter(0) - %Arg_1.2 = f32[] parameter(1) - %add.3 = f32[] add(f32[] %Arg_0.1, f32[] %Arg_1.2) - ROOT %tuple.4 = (f32[]) tuple(f32[] %add.3) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); - - // Expect an in order input mapping. - EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); - - // Expect two inputs, each containing a F32 scalar. - EXPECT_EQ(compilation_result.xla_input_shapes.size(), 2); - xla::Shape expected_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {}); - EXPECT_EQ(compilation_result.xla_input_shapes[0], expected_input_shape); - EXPECT_EQ(compilation_result.xla_input_shapes[1], expected_input_shape); - - // Expect output shape is a tuple shape containing a single F32 Scalar type. - const xla::Shape output_shape = - xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); - const xla::Shape tuple_output_shape = - xla::ShapeUtil::MakeTupleShape({output_shape}); - EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); - - // Expect exactly 1 OutputDescription. - EXPECT_EQ(compilation_result.outputs.size(), 1); - const XlaCompiler::OutputDescription& output_desc = - compilation_result.outputs.front(); - EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); - EXPECT_EQ(output_desc.shape, TensorShape()); - EXPECT_FALSE(output_desc.is_constant); - EXPECT_FALSE(output_desc.is_tensor_list); - - // Expect no resource updates from computation. - EXPECT_TRUE(compilation_result.resource_updates.empty()); -} - -// Tests that foldable ops are constant-folded to enable legalization of ops -// that require compile time constant operand. -TEST(CompileSerializedMlirToXlaHloTest, CompileTimeConstantFoldedSuccess) { - // "tf.Shape" can only be folded away after shape inference. tf.Reshape can - // only be lowered when tf.Shape is folded into a constant. 
- constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<10x19xf32>, %arg1: tensor<19x10xf32> {mhlo.is_same_data_across_replicas}) -> tensor<10x19xf32> { - %0 = "tf.Shape"(%arg0) : (tensor<10x19xf32>) -> tensor<2xi64> - %1 = "tf.Reshape"(%arg1, %0) : (tensor<19x10xf32>, tensor<2xi64>) -> tensor<10x19xf32> - return %1 : tensor<10x19xf32> - } - } - )"; - - std::vector arg_shapes{TensorShape({10, 19}), - TensorShape({19, 10})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[10,19], f32[19,10])) -> (f32[10,19]) { - %arg_tuple.1 = (f32[10,19]{1,0}, f32[19,10]{1,0}) parameter(0), parameter_replication={false,true} - %get-tuple-element.2 = f32[10,19]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[19,10]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %arg_tuple.1), index=1 - %reshape.4 = f32[10,19]{1,0} reshape(f32[19,10]{1,0} %get-tuple-element.3) - ROOT %tuple.5 = (f32[10,19]{1,0}) tuple(f32[10,19]{1,0} %reshape.4) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -TEST(CompileSerializedMlirToXlaHloTest, ShapeInference) { - constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor { - %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false} : (tensor<*xf32>, tensor) -> tensor - return %0 : tensor - } - } - )"; - - std::vector arg_shapes{TensorShape({10, 17}), - TensorShape({17, 19})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - - constexpr char expected_signature[] = - R"((arg_tuple.1: (f32[10,17], f32[17,19])) -> (f32[10,19]))"; - EXPECT_THAT(status_or_hlo_module.ValueOrDie()->ToString(), - ::testing::HasSubstr(expected_signature)); -} - -TEST(CompileSerializedMlirToXlaHloTest, ShapeInferenceAfterLegalization) { - constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<8x16x16x64xbf16>, %arg1: tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) { - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg1, %arg1, %arg1) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, 
tensor<64xf32>, tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) - return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32> - } - } - )"; - - std::vector arg_shapes{TensorShape({8, 16, 16, 64}), - TensorShape({64})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - - constexpr char expected_signature[] = - R"(-> (bf16[8,16,16,64], f32[64], f32[64], f32[64], f32[64], f32[0]))"; - EXPECT_THAT(status_or_hlo_module.ValueOrDie()->ToString(), - ::testing::HasSubstr(expected_signature)); -} - -TEST(CompileSerializedMlirToXlaHloTest, ConstantFoldHook) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main() -> (tensor<0xi32>, tensor<0xi32>) { - %0 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> - %r0, %r1 = "tf.BroadcastGradientArgs"(%0, %0) {T = i32} : (tensor<0xi32>, tensor<0xi32>) -> (tensor<0xi32>, tensor<0xi32>) - return %r0, %r1 : tensor<0xi32>, tensor<0xi32> - } -} -)"; - - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.4 - -ENTRY %main.4 (arg_tuple.1: ()) -> (s32[0], s32[0]) { - %arg_tuple.1 = () parameter(0) - %constant.2 = s32[0]{0} constant({}) - ROOT %tuple.3 = (s32[0]{0}, s32[0]{0}) tuple(s32[0]{0} %constant.2, s32[0]{0} %constant.2) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// The following xla::OpSharding protos are used: -// Serialized string: -// "\08\03\1A\02\01\02\22\02\00\01" -// Proto debug string: -// type: OTHER -// tile_assignment_dimensions: 1 -// tile_assignment_dimensions: 2 -// tile_assignment_devices: 0 -// tile_assignment_devices: 1 -// -// Serialized string: -// "\08\01\1A\01\01\22\01\00" -// Proto debug string: -// type: MAXIMAL -// tile_assignment_dimensions: 1 -// tile_assignment_devices: 0 -// -// Serialized string: -// "" -// Proto debug string (empty but would equivalent to): -// type: REPLICATED -TEST(CompileSerializedMlirToXlaHloTest, ArgumentSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, %arg1: tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<128x1024xf32> {mhlo.sharding = ""}) { - return - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 
10}), - TensorShape({10, 1024}), - TensorShape({128, 1024})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[128,10], f32[10,1024], f32[128,1024])) -> () { - %arg_tuple.1 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) parameter(0), sharding={{devices=[1,2]0,1}, {maximal device=0}, {replicated}} - %get-tuple-element.2 = f32[128,10]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[10,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=1 - %get-tuple-element.4 = f32[128,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=2 - ROOT %tuple.5 = () tuple() -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -TEST(CompileSerializedMlirToXlaHloTest, BadArgumentSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "bad_sharding"}) { - return - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 10})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - ASSERT_FALSE(s.ok()); - EXPECT_EQ(s.error_message(), - "failed to parse argument sharding 0 'bad_sharding'"); -} - -TEST(CompileSerializedMlirToXlaHloTest, ResultSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 351 : i32}} { - func @main(%arg0: tensor<128x10xf32>, %arg1: tensor<10x1024xf32>, %arg2: tensor<128x1024xf32>) -> (tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor<128x1024xf32> {mhlo.sharding = ""}) { - return %arg0, %arg1, %arg2 : tensor<128x10xf32>, tensor<10x1024xf32>, tensor<128x1024xf32> - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 10}), - TensorShape({10, 1024}), - TensorShape({128, 1024})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.9 - -ENTRY %main.9 (arg_tuple.1: (f32[128,10], f32[10,1024], f32[128,1024])) -> (f32[128,10], f32[10,1024], f32[128,1024]) { - %arg_tuple.1 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, 
f32[128,1024]{1,0}) parameter(0) - %get-tuple-element.2 = f32[128,10]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=0 - %reshape.5 = f32[128,10]{1,0} reshape(f32[128,10]{1,0} %get-tuple-element.2) - %get-tuple-element.3 = f32[10,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=1 - %reshape.6 = f32[10,1024]{1,0} reshape(f32[10,1024]{1,0} %get-tuple-element.3) - %get-tuple-element.4 = f32[128,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=2 - %reshape.7 = f32[128,1024]{1,0} reshape(f32[128,1024]{1,0} %get-tuple-element.4) - ROOT %tuple.8 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) tuple(f32[128,10]{1,0} %reshape.5, f32[10,1024]{1,0} %reshape.6, f32[128,1024]{1,0} %reshape.7), sharding={{devices=[1,2]0,1}, {maximal device=0}, {replicated}} -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// Verify that conversion from Graph to MLIR and empty shape representation -// function is successful. -TEST(CompileGraphToXlaHlo, Basic) { - FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); - Graph graph(OpRegistry::Global()); - - Node* arg = test::graph::Arg(&graph, 0, DT_FLOAT); - test::graph::Retval(&graph, 0, arg); - - XlaCompiler::CompilationResult result; - XlaCompiler::Argument compiler_arg; - compiler_arg.kind = XlaCompiler::Argument::kParameter; - compiler_arg.shape = TensorShape(); - - TF_ASSERT_OK( - CompileGraphToXlaHlo(graph, /*args=*/{compiler_arg}, "XLA_CPU_JIT", - /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), - /*shape_representation_fn=*/nullptr, &result)); - - const xla::HloModuleConfig module_config( - result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - result.computation->proto(), module_config); - ASSERT_TRUE(status_or_hlo_module.ok()); - - constexpr char expected_hlo_module_string[] = R"(HloModule main.3 - -ENTRY %main.3 (Arg_0.1: f32[]) -> (f32[]) { - %Arg_0.1 = f32[] parameter(0) - ROOT %tuple.2 = (f32[]) tuple(f32[] %Arg_0.1) -} - -)"; - - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// Tests a conversion from Graph to MLIR with resource arguments. 
-TEST(CompileGraphToXlaHlo, Resources) { - FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); - Graph graph(OpRegistry::Global()); - - Scope scope = Scope::NewRootScope().ExitOnError(); - auto val = ops::_Arg(scope.WithOpName("arg0"), DT_FLOAT, 0); - auto var = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1); - auto assign = - ops::AssignVariableOp(scope.WithOpName("assign_variable"), var, val); - TF_ASSERT_OK(scope.ToGraph(&graph)); - - XlaCompiler::CompilationResult result; - XlaCompiler::Argument arg0; - arg0.kind = XlaCompiler::Argument::kParameter; - arg0.shape = TensorShape({2}); - XlaCompiler::Argument arg1; - arg1.kind = XlaCompiler::Argument::kResource; - arg1.shape = TensorShape({2}); - arg1.type = DT_FLOAT; - - TF_ASSERT_OK( - CompileGraphToXlaHlo(graph, /*args=*/{arg0, arg1}, "XLA_CPU_JIT", - /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), - /*shape_representation_fn=*/nullptr, &result)); - - EXPECT_EQ(result.outputs.size(), 0); - ASSERT_EQ(result.resource_updates.size(), 1); - const auto& resource_update = result.resource_updates[0]; - EXPECT_EQ(resource_update.input_index, 1); - EXPECT_EQ(resource_update.modified, true); - EXPECT_EQ(resource_update.shape, TensorShape({2})); - EXPECT_EQ(resource_update.type, DT_FLOAT); - - const xla::HloModuleConfig module_config( - result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - result.computation->proto(), module_config); - ASSERT_TRUE(status_or_hlo_module.ok()); - - constexpr char expected_hlo_module_string[] = - R"(HloModule main.4, input_output_alias={ {0}: 1 } - -ENTRY %main.4 (Arg_0.1: f32[2], Arg_1.2: f32[2]) -> (f32[2]) { - %Arg_1.2 = f32[2]{0} parameter(1) - %Arg_0.1 = f32[2]{0} parameter(0) - ROOT %tuple.3 = (f32[2]{0}) tuple(f32[2]{0} %Arg_0.1) -} - -)"; - - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 359314a64b0..05e1f059029 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -36,8 +36,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" @@ -161,7 +161,7 @@ StatusOr ConvertTensor(const Tensor& input_tensor, default: // TODO(shpeisman): restructure code to reuse dialect pointer across // calls. - auto* dialect = builder->getContext()->getRegisteredDialect("tf"); + auto* dialect = builder->getContext()->getLoadedDialect("tf"); return OpaqueElementsAttr::get(dialect, type, MangleTensor(input_tensor)); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index bf96e3d1df4..6266a5e2195 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
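
For readers following the convert_tensor_test.cc hunk below: the tests move from one-time global dialect registration to loading the TensorFlow dialect into each MLIRContext. A minimal, self-contained sketch of that pattern, assuming TensorFlowDialect is exposed via tf_ops.h as it is elsewhere in this patch (illustrative only, not part of the diff):

#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

// Load the TensorFlow dialect into one specific context before building or
// parsing any tf.* ops, types, or attributes with that context.
static void RegisterDialects(mlir::MLIRContext& context) {
  context.loadDialect<mlir::TF::TensorFlowDialect>();
}

// Typical use inside a test body:
//   mlir::MLIRContext context;
//   RegisterDialects(context);
//   mlir::Builder b(&context);
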
#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -33,16 +34,13 @@ limitations under the License. namespace tensorflow { namespace { -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - return true; - }(); - (void)init_once; +static void RegisterDialects(mlir::MLIRContext &context) { + context.loadDialect(); } TEST(ConvertTypeToTensorTypeTest, UnrankedTensorType) { mlir::MLIRContext context; + RegisterDialects(context); mlir::Builder b(&context); PartialTensorShape output_shape = @@ -52,6 +50,7 @@ TEST(ConvertTypeToTensorTypeTest, UnrankedTensorType) { TEST(ConvertTypeToTensorTypeTest, NonFullyDefinedRankedTensorType) { mlir::MLIRContext context; + RegisterDialects(context); mlir::Builder b(&context); PartialTensorShape output_shape = ConvertTypeToTensorShape( @@ -61,6 +60,7 @@ TEST(ConvertTypeToTensorTypeTest, NonFullyDefinedRankedTensorType) { TEST(ConvertTypeToTensorTypeTest, FullyDefinedRankedTensorType) { mlir::MLIRContext context; + RegisterDialects(context); mlir::Builder b(&context); PartialTensorShape output_shape = ConvertTypeToTensorShape( @@ -77,8 +77,8 @@ TEST(ConvertTypeToTensorTypeTest, ScalarTensorType) { } TEST(ConvertTypeToTensorTypeTest, ConvertStringTensor) { - RegisterDialects(); mlir::MLIRContext context; + RegisterDialects(context); mlir::Builder b(&context); // Create the sample tensor to convert. @@ -123,9 +123,8 @@ class ConvertTensorTest : public ::testing::Test { }; TEST_F(ConvertTensorTest, Simple) { - RegisterDialects(); - mlir::MLIRContext context; + RegisterDialects(context); ASSERT_NO_FATAL_FAILURE(VerifyConversion( {Eigen::half(1.0)}, DT_HALF, mlir::FloatType::getF16(&context))); ASSERT_NO_FATAL_FAILURE( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index 0caceb69510..0d035e8f864 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -91,64 +91,62 @@ Status ConvertDataType(DataType dtype, Builder builder, Type* type) { } Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { - switch (type.getKind()) { - case mlir::StandardTypes::F16: - *dtype = DT_HALF; - return Status::OK(); - case mlir::StandardTypes::F32: - *dtype = DT_FLOAT; - return Status::OK(); - case mlir::StandardTypes::F64: - *dtype = DT_DOUBLE; - return Status::OK(); - case mlir::StandardTypes::BF16: - *dtype = DT_BFLOAT16; - return Status::OK(); - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - *dtype = DT_BOOL; - return Status::OK(); - case 8: - *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; - return Status::OK(); - case 16: - *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; - return Status::OK(); - case 32: - *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; - return Status::OK(); - case 64: - *dtype = itype.isUnsigned() ? 
DT_UINT64 : DT_INT64; - return Status::OK(); - default: - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } - } - case mlir::StandardTypes::Complex: { - auto etype = type.cast().getElementType(); - if (etype.isF32()) { - *dtype = DT_COMPLEX64; - return Status::OK(); - } else if (etype.isF64()) { - *dtype = DT_COMPLEX128; - return Status::OK(); - } - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case mlir::TF::TensorFlowTypes::enumerant: \ - *dtype = DT_##enumerant; \ + if (type.isF16()) { + *dtype = DT_HALF; return Status::OK(); + } else if (type.isF32()) { + *dtype = DT_FLOAT; + return Status::OK(); + } else if (type.isF64()) { + *dtype = DT_DOUBLE; + return Status::OK(); + } else if (type.isBF16()) { + *dtype = DT_BFLOAT16; + return Status::OK(); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + *dtype = DT_BOOL; + return Status::OK(); + case 8: + *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; + return Status::OK(); + case 16: + *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; + return Status::OK(); + case 32: + *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; + return Status::OK(); + case 64: + *dtype = itype.isUnsigned() ? DT_UINT64 : DT_INT64; + return Status::OK(); + default: + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); + } + } else if (auto complex_type = type.dyn_cast()) { + auto etype = complex_type.getElementType(); + if (etype.isF32()) { + *dtype = DT_COMPLEX64; + return Status::OK(); + } else if (etype.isF64()) { + *dtype = DT_COMPLEX128; + return Status::OK(); + } + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); + } + +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (type.isa()) { \ + *dtype = DT_##enumerant; \ + return Status::OK(); \ + } // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } + + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); } Status ConvertToDataType(Type type, DataType* dtype) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index bf0b3b75ace..81892934efe 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/strings/string_view.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -25,6 +26,8 @@ limitations under the License. 
#include "llvm/Support/Regex.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/core/common_runtime/device.h" @@ -155,4 +158,19 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, llvm::formatv("unsupported '{0}' attribute", kDevicesAttr)); } +mlir::LogicalResult GetDeviceOrdinalFromDeviceString(mlir::Location loc, + llvm::StringRef device, + int64_t* device_ordinal) { + DeviceNameUtils::ParsedName parsed_name; + if (!DeviceNameUtils::ParseFullName( + absl::string_view(device.data(), device.size()), &parsed_name)) + return mlir::emitError(loc) << "invalid device '" << device << "'"; + + if (!parsed_name.has_id) + return mlir::emitError(loc) << "device '" << device << "' has no id"; + + *device_ordinal = parsed_name.id; + return mlir::success(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h index 893e118024c..14e48bf7710 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ #include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" @@ -41,6 +42,12 @@ void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set); mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, mlir::TF::RuntimeDevices* devices); +// Parses a device string and returns its ordinal (id). This will return an +// error if the device string is invalid or has no id. 
+mlir::LogicalResult GetDeviceOrdinalFromDeviceString(mlir::Location loc, + llvm::StringRef device, + int64_t* device_ordinal); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index bc849e1d116..1da1f5973f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -205,5 +205,47 @@ TEST(DeviceUtilTest, GetGpuDeviceMetadata) { ASSERT_FALSE(meta_1.hasValue()); } +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceString) { + const std::string tpu0 = "/job:worker/replica:0/task:0/device:TPU:0"; + const std::string tpu1 = "/job:worker/replica:0/task:0/device:TPU:1"; + + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal0 = -1; + mlir::LogicalResult result0 = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu0, &device_ordinal0); + EXPECT_TRUE(mlir::succeeded(result0)); + EXPECT_EQ(device_ordinal0, 0); + + int64_t device_ordinal1 = -1; + mlir::LogicalResult result1 = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu1, &device_ordinal1); + EXPECT_TRUE(mlir::succeeded(result1)); + EXPECT_EQ(device_ordinal1, 1); +} + +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceStringInvalid) { + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal = -1; + mlir::LogicalResult result = GetDeviceOrdinalFromDeviceString( + unknown_loc, "bad_device", &device_ordinal); + EXPECT_TRUE(mlir::failed(result)); +} + +TEST(DeviceUtilTest, GetDeviceOrdinalFromDeviceStringNoId) { + const std::string tpu_no_id = "/job:worker/replica:0/task:0/device:TPU"; + + mlir::MLIRContext context; + auto unknown_loc = mlir::UnknownLoc::get(&context); + + int64_t device_ordinal = -1; + mlir::LogicalResult result = + GetDeviceOrdinalFromDeviceString(unknown_loc, tpu_no_id, &device_ordinal); + EXPECT_TRUE(mlir::failed(result)); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h index 4feb3837357..b5f2acc581d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h @@ -27,7 +27,7 @@ limitations under the License. namespace mlir { // TensorFlow's Status is used for error reporting back to callers. -using tensorflow::Status; +using ::tensorflow::Status; // Diagnostic handler that collects all the diagnostics reported and can produce // a Status to return to callers. This is for the case where MLIR functions are diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 0364b935b92..67c2aebf121 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -22,6 +22,7 @@ limitations under the License. 
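
The ConvertAttributes rewrite later in this export_utils.cc diff replaces the attribute-kind switch with llvm::TypeSwitch dispatch over concrete attribute classes. Below is a minimal standalone sketch of that dispatch pattern with the template parameters spelled out; the class list is abbreviated and DescribeAttr is a hypothetical helper, not code from this change:

#include <string>

#include "llvm/ADT/TypeSwitch.h"
#include "mlir/IR/Attributes.h"  // from @llvm-project

// Dispatch on the concrete attribute class; a single .Case can list several
// classes, and .Default handles anything not matched above it.
static std::string DescribeAttr(mlir::Attribute attr) {
  return llvm::TypeSwitch<mlir::Attribute, std::string>(attr)
      .Case<mlir::BoolAttr>([](mlir::BoolAttr) { return std::string("bool"); })
      .Case<mlir::IntegerAttr, mlir::FloatAttr>(
          [](auto) { return std::string("number"); })
      .Case<mlir::StringAttr>(
          [](mlir::StringAttr) { return std::string("string"); })
      .Default([](mlir::Attribute) { return std::string("unhandled"); });
}
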
#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -227,25 +228,13 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { return Status::OK(); } -// Updates NodeDef constructed out of an MLIR If op to map it to either -// TensorFlow StatelessIf or If op depending on the additional attribute. -void UpdateCompositeIfOp(NodeDef* node_def) { +// Updates NodeDef constructed out of an MLIR Case/IfW/While op to map it to +// either TensorFlow StatelessX or X op depending on the additional attribute. +void UpdateCompositeOp(NodeDef* node_def) { auto it = node_def->mutable_attr()->find("is_stateless"); if (it != node_def->attr().end()) { if (it->second.b()) { - *node_def->mutable_op() = "StatelessIf"; - } - node_def->mutable_attr()->erase(it); - } -} - -// Updates NodeDef constructed out of an MLIR While op to map it to either -// TensorFlow StatelessWhile or While op depending on the additional attribute. -void UpdateCompositeWhileOp(NodeDef* node_def) { - auto it = node_def->mutable_attr()->find("is_stateless"); - if (it != node_def->attr().end()) { - if (it->second.b()) { - *node_def->mutable_op() = "StatelessWhile"; + *node_def->mutable_op() = "Stateless" + node_def->op(); } node_def->mutable_attr()->erase(it); } @@ -352,8 +341,9 @@ StatusOr> GetOperationNodeDef( TF_RETURN_IF_ERROR(ConvertLocation( inst->getLoc(), node_def->mutable_experimental_debug_info())); - if (node_def->op() == "If") UpdateCompositeIfOp(node_def.get()); - if (node_def->op() == "While") UpdateCompositeWhileOp(node_def.get()); + if (node_def->op() == "Case") UpdateCompositeOp(node_def.get()); + if (node_def->op() == "If") UpdateCompositeOp(node_def.get()); + if (node_def->op() == "While") UpdateCompositeOp(node_def.get()); return node_def; } @@ -379,65 +369,36 @@ Status ConvertAttributes( name = mangling_util::DemangleAttributeName(name); } AttrValue value; - switch (attr.getKind()) { - case mlir::StandardAttributes::SymbolRef: { - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - func_call_attrs[string(name)] = value; - continue; - } - case mlir::StandardAttributes::Integer: - if (auto boolAttr = attr.dyn_cast()) { - TF_RETURN_IF_ERROR(ConvertAttribute(boolAttr, &value)); - } else { - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - } - break; - case mlir::StandardAttributes::Float: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::String: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Array: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::DenseIntOrFPElements: - case mlir::StandardAttributes::DenseStringElements: - case mlir::StandardAttributes::OpaqueElements: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Type: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Unit: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case static_cast(mlir::TF::AttrKind::SHAPE): - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case static_cast(mlir::TF::AttrKind::FUNC): { - TF_RETURN_IF_ERROR( - 
ConvertAttribute(attr.cast(), &value)); - func_call_attrs[string(name)] = value; - continue; - } - // AffineMap kind is not implemented. - case mlir::StandardAttributes::AffineMap: - return errors::Unimplemented("AffineMap attribute (needed for '", - name_strref, "') unimplemented"); - default: - return errors::Unimplemented("Unhandled attribute kind for attribute '", - name_strref, '\''); + if (auto symbol_ref = attr.dyn_cast()) { + TF_RETURN_IF_ERROR( + ConvertAttribute(symbol_ref.cast(), &value)); + func_call_attrs[string(name)] = value; + continue; } + if (auto func_attr = attr.dyn_cast()) { + TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, &value)); + func_call_attrs[string(name)] = value; + continue; + } + if (attr.isa()) { + // AffineMapAttr is not implemented. + return errors::Unimplemented("AffineMap attribute (needed for '", + name_strref, "') unimplemented"); + } + TF_RETURN_IF_ERROR( + llvm::TypeSwitch(attr) + .Case( + [&](auto derived_attr) { + return ConvertAttribute(derived_attr, &value); + }) + .Default([&](mlir::Attribute) { + return errors::Unimplemented( + "Unhandled attribute kind for attribute '", name_strref, + '\''); + })); + // According to the NodeDef proto definition, an attribute name from the // input TensorFlow GraphDef shouldn't contain '.'. If it does appear in // the attribute from MLIR, it is treated as an attribute from function diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc new file mode 100644 index 00000000000..8e9495c0454 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +std::string SerializeMlirModule(mlir::ModuleOp module_op) { + std::string serialized_mlir_module; + llvm::raw_string_ostream os(serialized_mlir_module); + mlir::OpPrintingFlags print_flags; + print_flags.enableDebugInfo(); + module_op.print(os, print_flags); + return std::move(os.str()); +} + +Status DeserializeMlirModule(llvm::StringRef serialized_mlir_module, + mlir::MLIRContext* mlir_context, + mlir::OwningModuleRef* mlir_module) { + TF_RET_CHECK(!serialized_mlir_module.empty()) + << "unexpected empty serialized MLIR module string"; + TF_RET_CHECK(mlir_module) << "unexpected null MLIR module pointer"; + + // Make sure we catch any error reported by MLIR and forward it to the TF + // error reporting system. 
+ mlir::StatusScopedDiagnosticHandler error_handler(mlir_context); + + // Parse the module. + *mlir_module = mlir::parseSourceString(serialized_mlir_module, mlir_context); + if (!*mlir_module) + return error_handler.Combine( + errors::InvalidArgument("could not parse MLIR module")); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h new file mode 100644 index 00000000000..12d1c39132e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Prints a MLIR module `module_op` and returns it as a string. +std::string SerializeMlirModule(mlir::ModuleOp module_op); + +// Parses a MLIR module from `mlir_module_string` into `mlir_module` with +// context `mlir_context`. +Status DeserializeMlirModule(llvm::StringRef serialized_mlir_module, + mlir::MLIRContext* mlir_context, + mlir::OwningModuleRef* mlir_module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc new file mode 100644 index 00000000000..d1815a4a88b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc @@ -0,0 +1,434 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" + +#include +#include +#include +#include + +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +#define DEBUG_TYPE "tf-shape-inference-utils" + +using ::tensorflow::int64; +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeHandle; + +namespace mlir { +namespace TF { + +namespace { + +// Extracts attributes from a MLIR operation, including derived attributes. +NamedAttrList GetAllAttributesFromOperation(Operation* op) { + NamedAttrList attr_list; + attr_list.append(op->getAttrDictionary().getValue()); + + if (auto derived = dyn_cast(op)) { + auto materialized = derived.materializeDerivedAttributes(); + attr_list.append(materialized.getValue()); + } + + return attr_list; +} + +// Extracts a PartialTensorShape from the MLIR type. +Optional GetShapeFromMlirType(Type t) { + if (auto ranked_type = t.dyn_cast()) { + // Convert the MLIR shape indices (int64_t) to TensorFlow indices + // (int64). + ArrayRef shape = ranked_type.getShape(); + SmallVector tf_shape(shape.begin(), shape.end()); + return tensorflow::PartialTensorShape( + MutableArrayRefToSpan(tf_shape)); + } + return None; +} + +// Gets the subtype's shape and data type for `type`. Templated to support both +// ResourceType and VariantType. 
+template +std::unique_ptr>> +GetSubtypesHelper(Type type) { + auto type_with_subtypes = + type.cast().getElementType().dyn_cast(); + if (!type_with_subtypes || type_with_subtypes.getSubtypes().empty()) { + return nullptr; + } + auto shapes_and_types = std::make_unique>>(); + for (auto subtype : type_with_subtypes.getSubtypes()) { + auto shape = GetShapeFromMlirType(subtype); + // handle_shapes_and_types requires all shapes to be known. So if any + // subtype is unknown, clear the vector. + if (!shape) { + shapes_and_types = nullptr; + break; + } + tensorflow::DataType dtype; + auto status = + tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); + assert(status.ok() && "Unknown element type"); + shapes_and_types->emplace_back(*shape, dtype); + } + return shapes_and_types; +} + +// Gets the subtype's shape and data type for `type`. +std::unique_ptr>> +GetSubtypes(Type type) { + auto subclasses = GetSubtypesHelper(type); + if (subclasses) return subclasses; + return GetSubtypesHelper(type); +} + +// Returns a shape inference function call failure at `location`. +LogicalResult EmitErrorFromShapeFunction(Optional location, + StringRef op_name, + StringRef error_message) { + LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << op_name + << "': " << error_message << "\n"); + return emitOptionalError( + location, + llvm::formatv( + "TensorFlow shape inference function errored for op '{0}': {1}", + op_name, error_message) + .str()); +} + +// Extracts shape from a shape handle and inference context. +Optional> GetShapeFromHandle(InferenceContext& context, + const ShapeHandle& sh) { + if (!context.RankKnown(sh)) return None; + SmallVector shape; + for (int dim : llvm::seq(0, context.Rank(sh))) + shape.push_back(context.Value(context.Dim(sh, dim))); + return shape; +} + +// Creates a tensor type from a shape handle and element type. +TensorType CreateTensorType(InferenceContext& context, const ShapeHandle& sh, + Type element_type) { + auto shape = GetShapeFromHandle(context, sh); + if (shape.hasValue()) + return RankedTensorType::get(shape.getValue(), element_type); + return UnrankedTensorType::get(element_type); +} + +// Creates a ShapedTypeComponent from a shape handle and element type. +ShapedTypeComponents CreateShapedTypeComponents(InferenceContext& context, + const ShapeHandle& sh, + Type element_type) { + auto shape = GetShapeFromHandle(context, sh); + if (shape.hasValue()) + return ShapedTypeComponents(shape.getValue(), element_type); + return ShapedTypeComponents(element_type); +} + +// Runs TensorFlow shape inference associated to the op type registered in the +// TensorFlow op registry based on Graph version, operands, and attributes. +// Invoking this shape function will invoke conversions of parameters to the +// TensorFlow Graph equivalent data structures and back to MLIR equivalent data +// structures. This does not use a natively implemented shape inference in MLIR, +// and instead is temporary until shape functions are reimplemented/migrated to +// being in MLIR instead of the TensorFlow op registry. 
+LogicalResult InferReturnTypeComponentsFallback( + MLIRContext* context, StringRef op_name, int64_t graph_version, + Optional location, ValueRange operands, + const NamedAttrList& attributes, OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes) { + assert(op_name.startswith(TensorFlowDialect::getDialectNamespace())); + // Drop the `tf.` prefix to query TF registry. + std::string op_type = + op_name.drop_front(TensorFlowDialect::getDialectNamespace().size() + 1) + .str(); + + // Get information from the registry and check if we have a shape function for + // this op. + const tensorflow::OpRegistrationData* op_reg_data = + tensorflow::OpRegistry::Global()->LookUp(op_type); + if (!op_reg_data) { + LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" + << op_name << "'.\n"); + return emitOptionalError(location, "op is unregistered"); + } + if (!op_reg_data->shape_inference_fn) { + LLVM_DEBUG(llvm::dbgs() + << "Skipping inference for op without shape function '" + << op_name << "'.\n"); + return emitOptionalError(location, "missing shape function"); + } + + // Convert the operation attributes to be able to use the InferenceContext + // and the TensorFlow shape function. + tensorflow::AttrValueMap converted_attributes; + NamedAttrList attributes_to_convert; + // Filter out unregistered attributes. + for (const auto& attr_def : op_reg_data->op_def.attr()) + if (auto registered_attr = attributes.get(attr_def.name())) + attributes_to_convert.set(attr_def.name(), registered_attr); + + auto attrs_status = tensorflow::ConvertAttributes( + attributes_to_convert, /*attrs_to_ignore=*/{}, &converted_attributes); + if (!attrs_status.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Error creating attribute map for '" << op_name + << "': " << attrs_status.error_message() << "\n"); + return emitOptionalError( + location, + "failed to convert attributes to proto map"); + } + + // Collect an array with input values for constant operands and input shapes + // for all the operands. + std::vector input_tensors(operands.size()); + std::vector input_shapes(operands.size()); + std::vector tensors(operands.size()); + std::vector>>> + handle_shapes_and_types(operands.size()); + for (auto it : llvm::enumerate(operands)) { + Value operand = it.value(); + size_t index = it.index(); + + // If the operand is constant, then convert it to Tensor. + if (auto attr = operand_as_constant_fn(operand)) { + tensorflow::Tensor* input_tensor = &tensors[index]; + auto status = + tensorflow::ConvertToTensor(attr.cast(), input_tensor); + if (status.ok()) { + input_tensors[index] = input_tensor; + } else { + LLVM_DEBUG(llvm::dbgs() << "Error converting input " << index + << " of op '" << op_name << "' to Tensor: " + << status.error_message() << "\n"); + } + } + + Type operand_type = operand.getType(); + if (auto shape = GetShapeFromMlirType(operand_type)) { + input_shapes[index] = *shape; + } + // Collect the handle shapes and types for a resource/variant. + handle_shapes_and_types[index] = GetSubtypes(operand_type); + } + + // Perform the shape inference using an InferenceContext with the input + // shapes. This object is abstracting the information that the ShapeInference + // function operates on. 
+ InferenceContext c(graph_version, + tensorflow::AttrSlice(&converted_attributes), + op_reg_data->op_def, input_shapes, input_tensors, + /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); + auto status = c.Run(op_reg_data->shape_inference_fn); + if (!status.ok()) + return EmitErrorFromShapeFunction(location, op_name, + status.error_message()); + + // Determine if, during shape computation, the shape functions attempted to + // query an input operand as shape where the input was not known/constant. + bool requires_inputs = + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + return c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]; + }); + if (requires_inputs) { + LLVM_DEBUG(llvm::dbgs() << "\trequired input\n"); + std::vector input_tensors_as_shapes; + for (int input : llvm::seq(0, c.num_inputs())) { + if (c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]) { + LLVM_DEBUG(llvm::dbgs() << "Requesting " << input << " as shape\n"); + auto op_result = operands[input].dyn_cast(); + if (!op_result) continue; + // Resize on first valid shape computed. + input_tensors_as_shapes.resize(c.num_inputs()); + auto handle = op_result_as_shape_fn(c, op_result); + LLVM_DEBUG(llvm::dbgs() << "Requested " << input << " as shape " + << (handle.Handle() ? "found" : "not found")); + if (handle.Handle()) input_tensors_as_shapes[input] = handle; + } + } + + // Attempt to compute the unknown operands as shapes. + // Note: in the case where no partial outputs could be computed, this + // would be empty. + if (!input_tensors_as_shapes.empty()) { + c.set_input_tensors_as_shapes(input_tensors_as_shapes); + auto status = c.Run(op_reg_data->shape_inference_fn); + if (!status.ok()) + return EmitErrorFromShapeFunction(location, op_name, + status.error_message()); + } + } + + // Update the shape for each of the operation result if the InferenceContext + // has more precise shapes recorded. + for (int output : llvm::seq(0, c.num_outputs())) { + ShapeHandle shape_handle = c.output(output); + LLVM_DEBUG(llvm::dbgs() << "Inferred output " << output << " : " + << c.DebugString(shape_handle) << "\n"); + + Type new_element_type = result_element_type_fn(output); + // Populate the handle shapes for a resource/variant. 
+ if (new_element_type && + new_element_type.isa()) { + auto handle_shapes_types = c.output_handle_shapes_and_types(output); + if (handle_shapes_types) { + SmallVector subtypes; + Builder b(context); + for (const auto& shape_n_type : *handle_shapes_types) { + Type element_type; + auto status = + tensorflow::ConvertDataType(shape_n_type.dtype, b, &element_type); + assert(status.ok() && "Unknown element type"); + subtypes.push_back( + CreateTensorType(c, shape_n_type.shape, element_type)); + } + if (new_element_type.isa()) { + new_element_type = TF::ResourceType::get(subtypes, context); + } else { + new_element_type = TF::VariantType::get(subtypes, context); + } + } + } + inferred_return_shapes.push_back( + CreateShapedTypeComponents(c, shape_handle, new_element_type)); + } + + return success(); +} + +} // namespace + +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes) { + auto attributes = GetAllAttributesFromOperation(op); + return InferReturnTypeComponentsFallback( + op->getContext(), op->getName().getStringRef(), graph_version, location, + op->getOperands(), attributes, operand_as_constant_fn, + op_result_as_shape_fn, result_element_type_fn, inferred_return_shapes); +} + +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + SmallVectorImpl& inferred_return_shapes) { + if (auto type_op = dyn_cast(op)) { + auto attributes = GetAllAttributesFromOperation(op); + SmallVector inferred_return_types; + auto result = type_op.inferReturnTypes( + op->getContext(), location, op->getOperands(), + DictionaryAttr::get(attributes, op->getContext()), op->getRegions(), + inferred_return_types); + if (failed(result)) return failure(); + + inferred_return_shapes.resize(inferred_return_types.size()); + for (auto inferred_return_type : llvm::enumerate(inferred_return_types)) { + if (auto shaped_type = + inferred_return_type.value().dyn_cast()) { + if (shaped_type.hasRank()) { + inferred_return_shapes[inferred_return_type.index()] = + ShapedTypeComponents(shaped_type.getShape(), + shaped_type.getElementType()); + } else { + inferred_return_shapes[inferred_return_type.index()] = + ShapedTypeComponents(shaped_type.getElementType()); + } + } + } + + return success(); + } + + if (auto shape_type_op = dyn_cast(op)) { + auto attributes = GetAllAttributesFromOperation(op); + return shape_type_op.inferReturnTypeComponents( + op->getContext(), location, op->getOperands(), + DictionaryAttr::get(attributes, op->getContext()), op->getRegions(), + inferred_return_shapes); + } + + auto operand_as_constant_fn = [](Value operand) -> Attribute { + Attribute attr; + if (matchPattern(operand, m_Constant(&attr))) return attr; + return nullptr; + }; + + auto op_result_as_shape_fn = [](InferenceContext& ic, + OpResult op_result) -> ShapeHandle { + auto rt = op_result.getType().dyn_cast(); + if (!rt || rt.getRank() != 1 || !rt.hasStaticShape()) return {}; + + std::vector dims(rt.getDimSize(0), ic.UnknownDim()); + Attribute attr; + if (matchPattern(op_result, m_Constant(&attr))) { + auto elements = attr.dyn_cast(); + if (elements) + for (auto element : llvm::enumerate(elements.getIntValues())) + dims[element.index()] = ic.MakeDim(element.value().getSExtValue()); + } + return ic.MakeShape(dims); + }; + + auto result_element_type_fn = [](int) -> Type 
{ return nullptr; }; + + return InferReturnTypeComponentsForTFOp( + location, op, graph_version, operand_as_constant_fn, + op_result_as_shape_fn, result_element_type_fn, inferred_return_shapes); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h new file mode 100644 index 00000000000..eda2bc49514 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ + +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/framework/shape_inference.h" + +namespace mlir { +namespace TF { + +// Function that takes in a value and extracts a constant from it, if available. +// If the value cannot be resolved as a constant, a nullptr will be returned. +// Certain shape functions require constant values as arguments. +using OperandAsConstantFn = llvm::function_ref; + +// Function that takes in an operation result and computes a shape (can be +// partial) value. Certain shape functions require shape values as arguments. +using OpResultAsShapeFn = + llvm::function_ref; + +// Function that takes a result index and returns the element type. Element +// types are necessary for handle types (resource, variant). +using ResultElementTypeFn = llvm::function_ref; + +// Runs TensorFlow shape inference associated to the op type registered in the +// TensorFlow op registry based on the Graph version, operands, and attributes. +// Invoking this shape function will create conversions of parameters to the +// TensorFlow Graph equivalent data structures and back to MLIR equivalent data +// structures. This does not use a natively implemented shape inference in MLIR, +// and instead is temporary until shape functions are reimplemented/migrated to +// being in MLIR instead of the TensorFlow op registry. +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes); + +// Runs TensorFlow shape inference for an operation for a given Graph version. 
+// If an operation implements the `InferTypeOpInterface` or +// `InferShapedTypeOpInterface` interfaces, those are used instead but with +// derived attributes populated. Otherwise the above function is used but with +// default `operand_as_constant_fn` and `op_result_as_shape_fn` that only +// extracts a value if the operands are constant (no partial evaluation, and an +// empty `result_element_type_fn`. Element types with subtypes (DT_RESOURCE, +// DT_VARIANT) are not supported. +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + SmallVectorImpl& inferred_return_shapes); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc new file mode 100644 index 00000000000..bcc3fe62f99 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc @@ -0,0 +1,334 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/utils/string_container_utils.h" +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" 
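
Taken together, the utilities introduced in this patch can be exercised roughly as follows: deserialize a textual module produced by SerializeMlirModule, then run the fallback TensorFlow shape inference on each tf.* op. This is an illustrative sketch only; the function name, the dialect-namespace check, and the graph version constant are assumptions, not code from this change:

#include <cstdint>

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "mlir/IR/Module.h"  // from @llvm-project
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/Interfaces/InferTypeOpInterface.h"  // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h"
#include "tensorflow/core/platform/status.h"

// Parse a serialized MLIR module and infer result shapes for every TF op.
void InferShapesForSerializedModule(llvm::StringRef serialized_module) {
  mlir::MLIRContext context;
  context.loadDialect<mlir::TF::TensorFlowDialect>();

  mlir::OwningModuleRef module;
  tensorflow::Status status =
      tensorflow::DeserializeMlirModule(serialized_module, &context, &module);
  if (!status.ok()) return;

  constexpr int64_t kGraphVersion = 440;  // Arbitrary graph producer version.
  module->walk([&](mlir::Operation* op) {
    // Only query the TensorFlow op registry for ops in the tf dialect.
    if (!op->getDialect() || op->getDialect()->getNamespace() != "tf") return;
    llvm::SmallVector<mlir::ShapedTypeComponents, 4> inferred;
    if (mlir::succeeded(mlir::TF::InferReturnTypeComponentsForTFOp(
            op->getLoc(), op, kGraphVersion, inferred))) {
      // `inferred` now holds one ShapedTypeComponents entry per op result.
    }
  });
}
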
+#include "tensorflow/core/platform/status.h" + +// NOLINTNEXTLINE +llvm::cl::opt input_types( + "tf-xla-input-types", + llvm::cl::desc("XLA input argument types (kinds), separated by ','. " + "Supported types include ['parameter', 'resource']. If " + "empty, all arguments are assumed to be parameters."), + llvm::cl::init("")); + +namespace tensorflow { + +namespace { + +mlir::LogicalResult PrintHloModuleText( + const XlaCompilationResult& compilation_result, llvm::raw_ostream& output) { + const xla::HloModuleConfig module_config( + compilation_result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + compilation_result.computation->proto(), module_config); + if (!status_or_hlo_module.ok()) { + LOG(ERROR) << "Conversion to HLO module failed: " + << status_or_hlo_module.status().ToString(); + return mlir::failure(); + } + + xla::HloModule* hlo_module = status_or_hlo_module.ValueOrDie().get(); + + output << hlo_module->ToString(); + + if (!compilation_result.input_mapping.empty()) + output << "// InputMapping {" + << absl::StrJoin(compilation_result.input_mapping, ", ") << "}\n"; + + for (const auto& xla_input_shape : compilation_result.xla_input_shapes) + output << "// XlaInputShape " << xla_input_shape.ToString() << '\n'; + + output << "// XlaOutputShape " + << compilation_result.xla_output_shape.ToString() << '\n'; + + for (const auto& xla_output_description : compilation_result.outputs) { + output << "// XlaOutputDescription type=" + << DataTypeString(xla_output_description.type) << " shape=(" + << absl::StrJoin(xla_output_description.shape.dim_sizes(), ", ") + << ')'; + if (xla_output_description.input_index >= 0) + output << " input_index=" << xla_output_description.input_index; + if (xla_output_description.is_constant) output << " constant"; + if (xla_output_description.is_tensor_list) output << " tensor_list"; + output << '\n'; + } + + for (const auto& resource_update : compilation_result.resource_updates) { + output << "// ResourceUpdate input_index=" << resource_update.input_index + << " type=" << DataTypeString(resource_update.type) << " shape=(" + << absl::StrJoin(resource_update.shape.dim_sizes(), " ") << ')'; + if (resource_update.modified) output << " modified"; + output << '\n'; + } + + return mlir::success(); +} + +Status ParseArgumentShapes( + absl::string_view input_shapes_str, + llvm::SmallVectorImpl& arg_shapes) { + arg_shapes.clear(); + std::vector> input_shapes_vector; + TF_RETURN_IF_ERROR(ParseNodeShapes(input_shapes_str, input_shapes_vector)); + arg_shapes.resize(input_shapes_vector.size()); + for (const auto& shape : llvm::enumerate(input_shapes_vector)) + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + shape.value(), &arg_shapes[shape.index()].shape)); + + return Status::OK(); +} + +Status ParseDataTypes(absl::string_view data_types_str, + llvm::SmallVectorImpl& data_types) { + data_types.clear(); + std::vector input_dtypes_vector; + TF_RETURN_IF_ERROR(ParseNodeDataTypes(data_types_str, input_dtypes_vector)); + data_types.resize(input_dtypes_vector.size(), DT_INVALID); + for (auto data_type : llvm::enumerate(input_dtypes_vector)) { + if (!DataType_Parse(data_type.value(), &data_types[data_type.index()])) + return errors::InvalidArgument("Invalid dtype at index ", + data_type.index(), ": ", + data_type.value()); + const auto& resolved_dtype = data_types[data_type.index()]; + if (resolved_dtype == DT_INVALID || resolved_dtype == DT_STRING || + resolved_dtype == DT_RESOURCE || resolved_dtype == DT_VARIANT 
|| + IsRefType(resolved_dtype)) + return errors::InvalidArgument("Unsupported dtype at index ", + data_type.index(), ": ", + data_type.value()); + } + + return Status::OK(); +} + +Status ParseArgumentKinds( + absl::string_view input_types_str, + llvm::SmallVectorImpl& argument_kinds) { + argument_kinds.clear(); + if (input_types_str.empty()) return Status::OK(); + + std::vector argument_kind_strs = + absl::StrSplit(input_types_str, ','); + argument_kinds.reserve(argument_kind_strs.size()); + for (const auto& argument_kind_str : llvm::enumerate(argument_kind_strs)) { + const auto& value = argument_kind_str.value(); + if (value == "parameter") { + argument_kinds.push_back(XlaArgument::Kind::kParameter); + } else if (value == "resource") { + argument_kinds.push_back(XlaArgument::Kind::kResource); + } else { + return errors::InvalidArgument( + "Unsupported TF/XLA argument kind at index ", + argument_kind_str.index(), ": ", value); + } + } + + return Status::OK(); +} + +Status ParseXlaArguments(absl::string_view input_shapes_str, + absl::string_view input_dtypes_str, + absl::string_view arg_kinds_str, + llvm::SmallVectorImpl& xla_arguments) { + xla_arguments.clear(); + std::vector> input_shapes_vector; + TF_RETURN_IF_ERROR( + tensorflow::ParseNodeShapes(input_shapes_str, input_shapes_vector)); + llvm::SmallVector dtypes_vector; + TF_RETURN_IF_ERROR(ParseDataTypes(input_dtypes_str, dtypes_vector)); + llvm::SmallVector arg_kinds_vector; + TF_RETURN_IF_ERROR(ParseArgumentKinds(arg_kinds_str, arg_kinds_vector)); + + if (input_shapes_vector.empty()) + input_shapes_vector.resize(dtypes_vector.size()); + + if (arg_kinds_vector.empty()) + arg_kinds_vector.resize(input_shapes_vector.size(), + XlaArgument::Kind::kParameter); + + if (input_shapes_vector.size() != dtypes_vector.size() || + input_shapes_vector.size() != arg_kinds_vector.size()) + return errors::InvalidArgument( + "Input shapes, dtypes, and types/kinds must be of the same " + "length, but got ", + input_shapes_vector.size(), ", ", dtypes_vector.size(), ", and ", + arg_kinds_vector.size(), " respectively"); + + xla_arguments.resize(input_shapes_vector.size()); + for (const auto& arg_components : + llvm::zip(xla_arguments, input_shapes_vector, dtypes_vector, + arg_kinds_vector)) { + XlaArgument& arg = std::get<0>(arg_components); + TensorShape shape; + TF_RETURN_IF_ERROR( + TensorShapeUtils::MakeShape(std::get<1>(arg_components), &shape)); + arg.shape = std::move(shape); + arg.type = std::get<2>(arg_components); + arg.kind = std::get<3>(arg_components); + } + + return Status::OK(); +} + +} // anonymous namespace + +static mlir::LogicalResult MlirTfToHloTextTranslateFunction( + mlir::ModuleOp module_op, llvm::raw_ostream& output) { + if (!module_op) return mlir::failure(); + + llvm::SmallVector arg_shapes; + auto args_status = + ParseArgumentShapes(mlir::StringRefToView(input_shapes), arg_shapes); + if (!args_status.ok()) { + LOG(ERROR) << args_status.ToString(); + return mlir::failure(); + } + + XlaCompilationResult compilation_result; + auto compilation_status = CompileMlirToXlaHlo( + module_op, arg_shapes, /*device_type=*/"XLA_CPU_JIT", emit_use_tuple_arg, + emit_return_tuple, IdentityShapeRepresentationFn(), &compilation_result, + /*custom_legalization_passes=*/{}); + if (!compilation_status.ok()) { + LOG(ERROR) << "TF/XLA compilation failed: " + << compilation_status.ToString(); + return mlir::failure(); + } + + return PrintHloModuleText(compilation_result, output); +} + +static mlir::LogicalResult MlirTfGraphToHloTextTranslateFunction( + 
mlir::ModuleOp module_op, llvm::raw_ostream& output) { + if (!module_op) return mlir::failure(); + + llvm::SmallVector xla_arguments; + auto args_status = ParseXlaArguments( + mlir::StringRefToView(input_shapes), mlir::StringRefToView(input_dtypes), + mlir::StringRefToView(input_types), xla_arguments); + if (!args_status.ok()) { + LOG(ERROR) << args_status.ToString(); + return mlir::failure(); + } + + XlaCompilationResult compilation_result; + auto compilation_status = CompileGraphToXlaHlo( + module_op, xla_arguments, /*device_type=*/"XLA_CPU_JIT", + emit_use_tuple_arg, emit_return_tuple, IdentityShapeRepresentationFn(), + &compilation_result, /*custom_legalization_passes=*/{}); + if (!compilation_status.ok()) { + LOG(ERROR) << "TF/XLA compilation failed: " + << compilation_status.ToString(); + return mlir::failure(); + } + + return PrintHloModuleText(compilation_result, output); +} + +static void RegisterMlirInputDialects(mlir::DialectRegistry& registry) { + registry.insert(); +} + +static void RegisterGraphInputDialects(mlir::DialectRegistry& registry) { + RegisterMlirInputDialects(registry); + registry.insert(); +} + +static mlir::OwningModuleRef SerializedMlirStringAttrToMlirModuleTranslate( + llvm::StringRef input, mlir::MLIRContext* context) { + mlir::Attribute attr = mlir::parseAttribute(input, context); + if (!attr || !attr.isa()) { + LOG(ERROR) << "Input is not parsable as a MLIR StringAttr."; + return nullptr; + } + auto str_attr = attr.cast(); + + RegisterMlirInputDialects(context->getDialectRegistry()); + mlir::OwningModuleRef module_ref; + auto status = + DeserializeMlirModule(str_attr.getValue().str(), context, &module_ref); + if (!status.ok()) { + LOG(ERROR) << status.ToString(); + return nullptr; + } + + return module_ref; +} + +static mlir::LogicalResult MlirModuleToSerializedMlirStringAttrTranslate( + mlir::ModuleOp module_op, llvm::raw_ostream& output) { + output << "\""; + std::string serialized_module = SerializeMlirModule(module_op); + llvm::printEscapedString(serialized_module, output); + output << "\""; + return mlir::success(); +} + +} // namespace tensorflow + +static mlir::TranslateFromMLIRRegistration MlirTfToHloTextTranslate( + "mlir-tf-to-hlo-text", tensorflow::MlirTfToHloTextTranslateFunction, + tensorflow::RegisterMlirInputDialects); + +static mlir::TranslateFromMLIRRegistration MlirTfGraphToHloTextTranslate( + "mlir-tf-graph-to-hlo-text", + tensorflow::MlirTfGraphToHloTextTranslateFunction, + tensorflow::RegisterGraphInputDialects); + +static mlir::TranslateToMLIRRegistration SerializedMlirStringAttrToMlirModule( + "mlir-tf-str-attr-to-mlir", + tensorflow::SerializedMlirStringAttrToMlirModuleTranslate); + +static mlir::TranslateFromMLIRRegistration MlirModuleToSerializedMlirStringAttr( + "mlir-tf-mlir-to-str-attr", + tensorflow::MlirModuleToSerializedMlirStringAttrTranslate, + tensorflow::RegisterMlirInputDialects); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 843d491c330..3516e3a65d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -374,9 +374,8 @@ GetGeneralTPUExecutionDeviceAssignment( return (x + bound_x * (y + bound_y * z)) * bound_core + core; }; - std::vector used_device_ids( - location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), - false); + std::vector used_device_ids(bound_x * bound_y * bound_z * bound_core, + 
false); TPUDevicesAndHosts devices_and_hosts( num_replicas, llvm::SmallVector( num_cores_per_replica, TPUDeviceAndHost())); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index b23fbe7d73c..19eb5b2c476 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -625,8 +625,8 @@ TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { } TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -641,8 +641,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailModelParallelism) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -662,8 +662,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailModelParallelism) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingTopology) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -682,8 +682,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingTopology) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingDeviceAssignment) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -702,8 +702,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingDeviceAssignment) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceAssignment) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -725,8 +725,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceAssignment) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceName) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -750,8 +750,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceName) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); @@ -777,8 +777,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) { - mlir::registerDialect(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef module_ref = 
mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.cc new file mode 100644 index 00000000000..0647d42f315 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.cc @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h" + +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { + +WalkStage::WalkStage(mlir::Operation *op) + : num_regions_(op->getNumRegions()), next_region_(0) {} + +namespace detail { + +/// Walk all of the operations nested under and including the given operations. +void WalkOperations(mlir::Operation *op, VoidCallback callback) { + WalkStage stage(op); + + for (auto ®ion : op->getRegions()) { + // Invoke callback on the parent op before visiting each child region. + callback(op, stage); + stage.Advance(); + + for (auto &block : region) + // Early increment here in the case where the operation is erased. + for (auto &nestedOp : llvm::make_early_inc_range(block)) + WalkOperations(&nestedOp, callback); + } + + // Invoke callback after all regions have been visited. + callback(op, stage); +} + +/// Walk all of the operations nested under and including the given operations. +/// This methods walks operations until an interrupt signal is received. +mlir::WalkResult WalkOperations(mlir::Operation *op, + InterruptCallback callback) { + WalkStage stage(op); + + for (auto ®ion : op->getRegions()) { + // Invoke callback on the parent op before visiting each child region. + if (callback(op, stage).wasInterrupted()) + return mlir::WalkResult::interrupt(); + + stage.Advance(); + + for (auto &block : region) { + // Early increment here in the case where the operation is erased. + for (auto &nestedOp : llvm::make_early_inc_range(block)) + if (WalkOperations(&nestedOp, callback).wasInterrupted()) + return mlir::WalkResult::interrupt(); + } + } + return callback(op, stage); +} + +} // namespace detail +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h b/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h new file mode 100644 index 00000000000..31c1f4b62e6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h @@ -0,0 +1,168 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_UTIL_H_ + +#include <type_traits> + +#include "mlir/IR/Visitors.h" // from @llvm-project + +// This file defines generic (pre/in/post)-order MLIR IR visitors/walkers. The +// walk() utility that MLIR core provides traverses operations in a block/ +// blocks in a region in the program order, and these walkers do the same. When +// operations have regions attached to them, the core MLIR walkers visit the +// regions attached to an Op first, and then visit the op. So within the context +// of a single Op, the traversal is post-order (considering the Op as the parent +// node and regions as the children). For certain use cases, it may be more +// efficient/desirable to visit the parent Op before visiting the attached +// regions. As an example, if the attached regions have region arguments that +// are related to the operation inputs (tf.WhileRegion is an example), then we +// may want to propagate some information from the Op inputs to the region +// inputs and then visit the regions to continue propagating that information +// within the regions. With just post-order traversal, to achieve the same we +// may need to schedule another walk to make sure child regions get visited. +// A pre-order walk (within the context of a single operation) will avoid that. +// Similarly, for certain operations, we may want to visit the Op both before +// and after all regions have been visited (say to propagate information from +// inputs -> region arguments and then from region results -> outputs). 
+// In general, since the data flow between an operation and its regions is +// opaque in MLIR, we may need to visit the operation in-between regions as well +// if say region0 is transferring control back to the Op and from then to +// region1. So a more general walker that supports pre/in/post-order walk is +// desirable. To support this, the generic walkers defined below will invoke +// the walk callback on the parent Op at each stage of the child region walk, +// i.e., before visiting any region, in between regions, and after visiting all +// regions. To indicate the current walk stage, the callback will also get a +// `WalkStage` parameter. The callback can inspect the current walk stage and +// decide to take appropriate actions (including not doing anything). With this +// the walker below can support pre/in/post-order walks as well as a combined +// (pre+in+post)-order walk. + +namespace tensorflow { + +// A class to indicate the current walk stage.
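A minimal sketch of the intended call pattern for the walker interface in this header, placed here before the WalkStage class definition. Only GenericWalk and WalkStage (declared in this file) are assumed; the function name and the callback body are illustrative:

#include "mlir/IR/Operation.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h"

// Combined (pre+in+post)-order visit of every op nested under `root`.
void VisitAllStages(mlir::Operation *root) {
  tensorflow::GenericWalk(
      root, [](mlir::Operation *op, const tensorflow::WalkStage &stage) {
        if (stage.IsBeforeAllRegions()) {
          // Pre-order work, e.g. seed per-region state from the op's operands.
        } else if (stage.IsAfterAllRegions()) {
          // Post-order work, e.g. fold information produced by the regions.
        } else {
          // In-between-regions work: the previous region has been visited,
          // region stage.GetNextRegion() has not.
        }
      });
}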
+class WalkStage { + public: + explicit WalkStage(mlir::Operation *op); + + bool IsBeforeAllRegions() const { return next_region_ == 0; } + bool IsBeforeRegion(int region) const { return next_region_ == region; } + bool IsAfterRegion(int region) const { return next_region_ == region + 1; } + bool IsAfterAllRegions() const { return next_region_ == num_regions_; } + void Advance() { next_region_++; } + int GetNextRegion() const { return next_region_; } + + private: + const int num_regions_; + int next_region_; +}; + +namespace detail { +// This is similar to MLIR version, but works with multiple argument functions. +// Helper templates to deduce the first argument of a callback parameter. +template +Arg first_argument_type(Ret (*)(Arg, Rest...)); +template +Arg first_argument_type(Ret (F::*)(Arg, Rest...)); +template +Arg first_argument_type(Ret (F::*)(Arg, Rest...) const); +template +decltype(first_argument_type(&F::operator())) first_argument_type(F); + +/// Type definition of the first argument to the given callable 'T'. +template +using first_argument = decltype(first_argument_type(std::declval())); + +using VoidCallback = + llvm::function_ref; +using InterruptCallback = + llvm::function_ref; + +// Walk all of the operations nested under and including the given operation. +void WalkOperations(mlir::Operation *op, VoidCallback callback); + +// Walk all of the operations nested under and including the given operation. +// This methods walks operations until an interrupt result is returned by the +// callback. +mlir::WalkResult WalkOperations(mlir::Operation *op, + InterruptCallback callback); + +} // namespace detail + +// Walk all of the operations nested under and including the given operation. +// This method is selected for stage-aware callbacks that operate on Operation*. +// +// Example: +// tensorflow::walk(op, [](Operation *op, const WalkStage &stage) { ... }); +template , + typename RetT = decltype(std::declval()( + std::declval(), std::declval()))> +typename std::enable_if::value, + RetT>::type +GenericWalk(mlir::Operation *op, FuncTy &&callback) { + return detail::WalkOperations( + op, llvm::function_ref(callback)); +} + +// Walk all of the operations of type 'ArgT' nested under and including the +// given operation. This method is selected for void returning callbacks that +// operate on a specific derived operation type. +// +// Example: +// tensorflow::walk(op, [](ReturnOp op, const WalkStage &stage) { ... }); +template , + typename RetT = decltype(std::declval()( + std::declval(), std::declval()))> +typename std::enable_if::value && + std::is_same::value, + RetT>::type +GenericWalk(mlir::Operation *op, FuncTy &&callback) { + auto wrapperFn = [&](mlir::Operation *op, const WalkStage &stage) { + if (auto derivedOp = llvm::dyn_cast(op)) callback(derivedOp, stage); + }; + return detail::WalkOperations(op, + static_cast(wrapperFn)); +} + +// Walk all of the operations of type 'ArgT' nested under and including the +// given operation. This method is selected for WalkReturn returning +// interruptible callbacks that operate on a specific derived operation type. 
+// +// Example: +// tensorflow::walk(op, [](ReturnOp op, const WalkStage &stage) { +// if (some_invariant) +// return WalkResult::interrupt(); +// return WalkResult::advance(); +// }); +template , + typename RetT = decltype(std::declval()( + std::declval(), std::declval()))> +typename std::enable_if::value && + std::is_same::value, + RetT>::type +GenericWalk(mlir::Operation *op, FuncTy &&callback) { + auto wrapperFn = [&](mlir::Operation *op, const WalkStage &stage) { + if (auto derivedOp = llvm::dyn_cast(op)) + return callback(derivedOp, stage); + return mlir::WalkResult::advance(); + }; + return detail::WalkOperations( + op, static_cast(wrapperFn)); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 1416ac038d6..e48b14a6bc3 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -13,81 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" -#include "mlir/IR/AsmState.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" #include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/core/platform/init_main.h" -#include "tensorflow/core/platform/logging.h" - -// NOLINTNEXTLINE -static llvm::cl::opt input_filename(llvm::cl::Positional, - llvm::cl::desc(""), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt output_filename( - "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt split_input_file( - "split-input-file", - llvm::cl::desc("Split the input file into pieces and process each " - "chunk independently"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verify_diagnostics( - "verify-diagnostics", - llvm::cl::desc("Check that emitted diagnostics match " - "expected-* lines on the corresponding line"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verify_passes( - "verify-each", - llvm::cl::desc("Run the verifier after each transformation pass"), - llvm::cl::init(true)); - -// NOLINTNEXTLINE -static llvm::cl::opt allowUnregisteredDialects( - "allow-unregistered-dialect", - llvm::cl::desc("Allow operation with no registered dialects"), - llvm::cl::init(false)); int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); - // Register various MLIR command line options. 
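The replacement main() in this hunk (completed just below) drops the hand-rolled flag and file handling in favor of the registry-taking MlirOptMain entry point. A standalone sketch of the same pattern, with a hypothetical dialect list spelled out in the registry.insert<>() calls; only upstream MLIR and TensorFlow headers already used by this change are assumed:

#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
#include "mlir/InitAllPasses.h"               // from @llvm-project
#include "mlir/Support/MlirOptMain.h"         // from @llvm-project
#include "tensorflow/compiler/mlir/init_mlir.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

int main(int argc, char **argv) {
  tensorflow::InitMlir y(&argc, &argv);

  mlir::registerAllPasses();

  // Every dialect the tool may parse is declared up front; MlirOptMain then
  // loads them into the MLIRContext it creates, replacing the old global
  // registration and the hand-written command-line handling.
  mlir::DialectRegistry registry;
  registry.insert<mlir::StandardOpsDialect>();
  registry.insert<mlir::TF::TensorFlowDialect>();
  return failed(
      mlir::MlirOptMain(argc, argv, "Example MLIR opt driver\n", registry));
}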
- mlir::registerAsmPrinterCLOptions(); - mlir::registerMLIRContextCLOptions(); - mlir::registerPassManagerCLOptions(); + mlir::registerAllPasses(); - // Parse pass names in main to ensure static initialization completed. - mlir::PassPipelineCLParser pass_pipeline("", "Compiler passes to run"); - - llvm::cl::ParseCommandLineOptions(argc, argv, - "TF MLIR modular optimizer driver\n"); - - // Set up the input file. - std::string error_message; - auto file = mlir::openInputFile(input_filename, &error_message); - QCHECK(file) << error_message; - - auto output = mlir::openOutputFile(output_filename, &error_message); - QCHECK(output) << error_message; - - if (failed(mlir::MlirOptMain(output->os(), std::move(file), pass_pipeline, - split_input_file, verify_diagnostics, - verify_passes, allowUnregisteredDialects))) - return 1; - output->keep(); - return 0; + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + registry.insert(); + registry.insert(); + registry.insert(); + return failed( + mlir::MlirOptMain(argc, argv, "TensorFlow pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 8cfdfd01120..3ea92a70ec7 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -111,7 +111,6 @@ int main(int argc, char** argv) { if (import_saved_model_object_graph) { mlir::MLIRContext context; - auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, &context); if (!module_or.status().ok()) return 1; @@ -119,9 +118,8 @@ int main(int argc, char** argv) { module_or.ConsumeValueOrDie()->print(output->os()); } else if (import_saved_model_signature_defs) { mlir::MLIRContext context; - auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, exported_names, &context); + input_filename, tags, exported_names, &context, upgrade_legacy); if (!module_or.status().ok()) return 1; module_or.ConsumeValueOrDie()->print(output->os()); diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 7d3091f921f..b1bf20e4e48 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -68,17 +68,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "tensorflow_js_dialect_registration", - srcs = [ - "ir/dialect_registration.cc", - ], - deps = [ - ":tensorflow_js", - ], - alwayslink = 1, -) - gentbl( name = "tfjs_optimize_inc_gen", tbl_outs = [ @@ -107,7 +96,6 @@ cc_library( ], deps = [ ":tensorflow_js", - ":tensorflow_js_dialect_registration", "//tensorflow/compiler/mlir/tensorflow", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", @@ -129,7 +117,6 @@ cc_library( ":tfjs_optimize", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", @@ -149,12 +136,10 @@ cc_library( ], deps = [ ":tensorflow_js", - ":tensorflow_js_dialect_registration", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:export_utils", 
"//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", "//tensorflow/core:graph", @@ -192,7 +177,7 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", @@ -236,3 +221,20 @@ tf_cc_binary( "@llvm-project//mlir:Support", ], ) + +tf_cc_binary( + name = "tfjs-opt", + srcs = [ + "tfjs_opt.cc", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_passes", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/lite:tensorflow_lite_legalize_tf", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:StandardOps", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc index 9ba875cdce4..5ea3f51b475 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc @@ -15,18 +15,17 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h" -namespace mlir { -namespace tfjs { - #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc.inc" +namespace mlir { +namespace tfjs { + //===----------------------------------------------------------------------===// // TFJSDialect //===----------------------------------------------------------------------===// -TFJSDialect::TFJSDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { +void TFJSDialect::initialize() { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 9c98c9b0e19..bc52e3a0c7a 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -29,15 +29,9 @@ limitations under the License. #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir { -namespace tfjs { - #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_dialect.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h.inc" -} // namespace tfjs -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TFJS_IR_TFJS_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 134aa010d8c..e2539c2f6d8 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -39,7 +39,7 @@ def TFJSDialect : Dialect { TF graphs to be deployed on TFJS. 
}]; - let cppNamespace = "tfjs"; + let cppNamespace = "::mlir::tfjs"; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfjs/tests/BUILD b/tensorflow/compiler/mlir/tfjs/tests/BUILD index a4ebc997991..5789480c3ba 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/BUILD +++ b/tensorflow/compiler/mlir/tfjs/tests/BUILD @@ -3,8 +3,11 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", + data = [ + ":test_utilities", + "@llvm-project//mlir:run_lit.sh", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", test_file_exts = ["mlir"], ) @@ -13,7 +16,7 @@ filegroup( name = "test_utilities", testonly = True, data = [ - "//tensorflow/compiler/mlir:tf-opt", + "//tensorflow/compiler/mlir/tfjs:tfjs-opt", "@llvm-project//llvm:FileCheck", "@llvm-project//llvm:not", ], diff --git a/tensorflow/compiler/mlir/tfjs/tests/ops.mlir b/tensorflow/compiler/mlir/tfjs/tests/ops.mlir index 0b7210118df..602f34657a0 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/ops.mlir +++ b/tensorflow/compiler/mlir/tfjs/tests/ops.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -split-input-file -verify-diagnostics -tfl-runtime-verify %s | FileCheck %s +// RUN: tfjs-opt -split-input-file -verify-diagnostics -tfl-runtime-verify %s | FileCheck %s // ----- diff --git a/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir b/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir index 5f046dc5a8a..f4464ddd01d 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/tfjs/tests/optimize.mlir @@ -1,5 +1,5 @@ // Run optimize pass only and check the results. -// RUN: tf-opt %s -tfjs-optimize | FileCheck %s +// RUN: tfjs-opt %s -tfjs-optimize | FileCheck %s // CHECK-LABEL: prelu_fusion func @prelu_fusion(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { diff --git a/tensorflow/compiler/mlir/tfjs/tfjs_opt.cc b/tensorflow/compiler/mlir/tfjs/tfjs_opt.cc new file mode 100644 index 00000000000..c6013128295 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tfjs_opt.cc @@ -0,0 +1,33 @@ +/* Copyright 2020 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h" + +int main(int argc, char **argv) { + tensorflow::InitMlir y(&argc, &argv); + + mlir::registerAllPasses(); + + mlir::DialectRegistry registry; + registry.insert(); + registry.insert(); + registry.insert(); + return failed(mlir::MlirOptMain(argc, argv, "TF JS pass driver\n", registry)); +} diff --git a/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc index c03a68471bc..a3678f7d154 100644 --- a/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc @@ -37,6 +37,9 @@ namespace { // Optimize TFJS operations in functions. struct Optimize : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const final { + registry.insert(); + } void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 066ca221d5d..1fa224f3ac8 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -1,59 +1,127 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load( + "//tensorflow/core/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) -licenses(["notice"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", includes = ["//third_party/mlir:subpackages"], - packages = ["//tensorflow/compiler/mlir/..."], + packages = [ + "//tensorflow/compiler/mlir/...", + "//tensorflow/core/kernels/mlir_generated/...", + ], ) cc_library( - name = "cubin_creator", - srcs = ["cubin_creator.cc"], - hdrs = ["cubin_creator.h"], - copts = if_cuda(["-DGOOGLE_CUDA=1"]), + name = "kernel_creator", + srcs = ["kernel_creator.cc"], + hdrs = ["kernel_creator.h"], + copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]), deps = [ - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:TargetNVVMIR", - "@llvm-project//mlir:Transforms", - "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:all_passes", + "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", + "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", "//tensorflow/compiler/mlir/hlo:materialize_broadcasts", # 
buildcleaner: keep + "//tensorflow/compiler/mlir/hlo:transform_unranked_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/hlo:unfuse_batch_norm", # buildcleaner: keep + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/gpu:stream_executor_util", "//tensorflow/compiler/xla/service/gpu:target_constants", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", "//tensorflow/compiler/xla/service/mlir_gpu:kernel_lowering", + "//tensorflow/compiler/xla/service/mlir_gpu:passes", "//tensorflow/core:cuda_libdevice_path", "//tensorflow/core:lib", - ] + if_cuda(["//tensorflow/stream_executor/gpu:asm_compiler"]), + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AffineToStandard", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToGPURuntimeTransforms", + "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:LinalgToLLVM", + "@llvm-project//mlir:LinalgTransforms", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", + "@llvm-project//mlir:SCFToStandard", + "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:TargetROCDLIR", + "@llvm-project//mlir:Transforms", + ], ) tf_cc_binary( - name = "tf_to_cubin", - srcs = ["tf_to_cubin.cc"], + name = "tf_to_gpu_binary", + srcs = ["tf_to_gpu_binary.cc"], visibility = ["//tensorflow/core/kernels/mlir_generated:__pkg__"], deps = [ - ":cubin_creator", + ":kernel_creator", "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Pass", + ], +) + +tf_cc_binary( + name = "tf_to_kernel", + srcs = ["tf_to_kernel.cc"], + visibility = ["//tensorflow/core/kernels/mlir_generated:__pkg__"], + deps = [ + ":kernel_creator", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Analysis", + "@llvm-project//llvm:CodeGen", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:X86CodeGen", # fixdeps: keep + "@llvm-project//llvm:X86Disassembler", # fixdeps: keep + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:TargetLLVMIR", ], ) @@ -62,15 +130,28 @@ tf_cc_binary( srcs = ["tools/kernel-gen-opt/kernel-gen-opt.cc"], visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen/tests:__pkg__"], deps = [ + "//tensorflow/compiler/mlir/hlo:all_passes", "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", - 
"//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirOptLib", - "@llvm-project//mlir:MlirOptMain", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", ], ) + +exports_files(["tf_framework_c_interface.h"]) + +cc_library( + name = "tf_framework_c_interface", + srcs = ["tf_framework_c_interface.cc"], + hdrs = ["tf_framework_c_interface.h"], + deps = [ + "//tensorflow/core:framework", + "@llvm-project//mlir:mlir_runner_utils", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc deleted file mode 100644 index 1f511e27d9e..00000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===- cubin_creator.cc -----------------------------------------*- C++ -*-===// -// -// This file implements the function to compile a TF kernel function to a cubin. 
-// -//===----------------------------------------------------------------------===// -#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" - -#include -#include -#include - -#include "absl/memory/memory.h" -#include "absl/strings/escaping.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Target/NVVMIR.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/xla/transforms/passes.h" -#include "tensorflow/compiler/xla/debug_options_flags.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" -#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" -#include "tensorflow/compiler/xla/service/gpu/target_constants.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" -#include "tensorflow/core/platform/cuda_libdevice_path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/path.h" -#if GOOGLE_CUDA -#include "tensorflow/stream_executor/gpu/asm_compiler.h" -#endif - -namespace { -using tensorflow::Status; -using xla::InternalError; -using xla::StatusOr; - -StatusOr GetLibdeviceDir( - const xla::HloModuleConfig& hlo_module_config) { - for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( - hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { - std::string libdevice_dir = - tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); - VLOG(2) << "Looking for libdevice at " << libdevice_dir; - if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { - VLOG(2) << "Found libdevice dir " << libdevice_dir; - return libdevice_dir; - } - } - return InternalError( - "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); -} - -struct MaterializeBroadcastsPass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::ConversionTarget conversionTarget(getContext()); - mlir::OwningRewritePatternList conversionPatterns; - - // Consider the mhlo dialect legal for tests. - conversionTarget.addLegalDialect(); - // The conversion uses helpers from the Standard dialect. 
- conversionTarget.addLegalDialect(); - - mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(), - &conversionTarget); - mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(), - &conversionPatterns); - - if (failed(applyPartialConversion(getFunction(), conversionTarget, - conversionPatterns))) { - return signalPassFailure(); - } - } -}; - -struct UnfuseBatchNormPass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::OwningRewritePatternList patterns; - mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); - mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); - } -}; - -Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { - mlir::PassManager pm(module.getContext()); - auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { - return VLOG_IS_ON(1); - }; - pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, - /*shouldPrintAfterPass=*/enable_if_vlog_is_on, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/false, llvm::dbgs()); - pm.addNestedPass(mlir::mhlo::createLegalizeTFPass(false)); - pm.addNestedPass( - absl::make_unique()); - pm.addNestedPass(absl::make_unique()); - pm.addPass(mlir::mhlo::createLegalizeToLhloPass( - /*results_escape_functions=*/true)); - pm.addNestedPass(mlir::lmhlo::createLhloCopyRemovalPass()); - - if (failed(pm.run(module))) { - return InternalError("Lowering TF to LHLO failed."); - } - return Status::OK(); -} - -struct PropagateTensorFlowABIKnowledge - : public mlir::PassWrapper> { - explicit PropagateTensorFlowABIKnowledge(mlir::FunctionType type, - llvm::ArrayRef same_shape_) - : func_type(type), same_shape(same_shape_) {} - - void runOnOperation() override { - // We know due to tensorflow ABI that the offset is always 0 and that the - // innermost stride is always 1. To make this visible to the compiler, - // we insert constants into the code and replace usages accordingly. - // We do not change the signature so that we keep a somewhat stable ABI - // that is easy to undertand by tools. - // We also know that tensorflow aligns all allocated pointers by 16, so - // we pass this on. Furthermore, we know that arguments never alias. More - // precicely, they may only alias (due to reuse) if the kernel does not - // read from a position it previously has written to. We express this with - // the noalias attribute. - mlir::LLVM::LLVMFuncOp func = getOperation(); - - // This only works if the function is local and we can rewrite it. - if (func.isExternal()) return; - - mlir::OpBuilder b(func.getBody()); - // Steal the LLVM representation of the index type from the third argument. - auto index_type = func.getArgument(3).getType(); - mlir::Value one = b.create( - func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); - mlir::Value zero = b.create( - func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); - uint32_t arg_pos = 0; - std::vector positions; - // Collect the agument and return types of the surrounding function. - auto arg_types = llvm::to_vector<4>(llvm::concat( - func_type.getInputs(), func_type.getResults())); - for (mlir::Type arg_type : arg_types) { - if (!arg_type.isa()) { - func.emitError() << "argument of surrounding func is not ranked memref"; - signalPassFailure(); - return; - } - positions.push_back(arg_pos); - // Set alignment and aliasing on the pointers. - func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true)); - func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16)); - // Replace the offset with zero. 
Offset is argument number 3. - func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); - // Forward over base_ptr, aligned_ptr, offset, size and stride arguments. - arg_pos += 3 + arg_type.cast().getRank() * 2; - // Replace the last stride with constant 1. - func.getArgument(arg_pos - 1).replaceAllUsesWith(one); - } - - // If we have knowledge that some arguments have the same shape, we - // can use that here. Simply replace usages of the shape parameters within - // the function body to a single shape parameter. - if (!same_shape.empty()) { - auto first = same_shape.front(); - auto first_offset = positions.at(first); - auto first_type = arg_types[first].cast(); - uint32_t rank = first_type.getRank(); - for (auto same : same_shape.drop_front(1)) { - uint32_t same_offset = positions.at(same); - auto same_type = arg_types[same].cast(); - if (same_type.getRank() != rank) { - func.emitOpError() << "same shape constraints on arguments with " - "non-matching shapes: #" - << first << " and #" << same; - signalPassFailure(); - continue; - } - - for (uint32_t i = 0; i < 2 * rank; ++i) { - // Replace uses for second arg data with first arg. - auto same_arg = func.getArgument(same_offset + 3 + i); - auto first_arg = func.getArgument(first_offset + 3 + i); - same_arg.replaceAllUsesWith(first_arg); - } - } - } - } - - mlir::FunctionType func_type; - llvm::ArrayRef same_shape; -}; - -Status PropagateTensorFlowABIKnowledgeToKernel( - mlir::ModuleOp module, llvm::ArrayRef same_shape) { - // Grab the original signature from the single function. - auto func = *module.getBody()->op_begin(); - - mlir::PassManager pm(module.getContext()); - auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { - return VLOG_IS_ON(1); - }; - pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, - /*shouldPrintAfterPass=*/enable_if_vlog_is_on, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/false, llvm::dbgs()); - auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); - kernel_pm.addNestedPass( - absl::make_unique(func.getType(), - same_shape)); - - if (failed(pm.run(module))) { - return InternalError("Static knowledge propagation failed."); - } - return Status::OK(); -} - -void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - return true; - }(); - (void)init_once; -} -} // namespace - -StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( - llvm::StringRef tf_code, std::pair compute_capability, - llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, - llvm::ArrayRef unroll_factors) { - RegisterDialects(); - mlir::MLIRContext context; - mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); - - TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); - { - xla::mlir_gpu::LowerLHLOToGPUOptions options; - options.tile_sizes = tile_sizes; - options.unroll_factors = unroll_factors; - options.collapse_parallel_loops = false; - options.use_approximations = true; - TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); - } - TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); - TF_RETURN_IF_ERROR( - PropagateTensorFlowABIKnowledgeToKernel(module.get(), same_shape)); - - mlir::OwningModuleRef kernel_module = - xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); - if (!llvmModule) { - return InternalError("Could not translate MLIR module to NVVM"); - } - - llvmModule->setModuleIdentifier("acme"); - 
llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); - - xla::HloModuleConfig config; - config.set_debug_options(xla::GetDebugOptionsFromFlags()); - - auto enable_fusion = [](llvm::TargetMachine* target) { - target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; - }; - - TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); - TF_ASSIGN_OR_RETURN( - std::string ptx, - xla::gpu::nvptx::CompileToPtx(llvmModule.get(), compute_capability, - config, libdevice_dir, enable_fusion)); - VLOG(1) << ptx; - -#if GOOGLE_CUDA - return tensorflow::se::CompileGpuAsm( - std::get<0>(compute_capability), std::get<1>(compute_capability), - ptx.c_str(), xla::gpu::PtxOptsFromConfig(config)); -#else - return InternalError( - "GOOGLE_CUDA not defined. Did you specify --config=cuda ?"); -#endif -} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD index 3a28d4815d2..29939f227db 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -35,13 +35,3 @@ cc_library( "@llvm-project//mlir:SideEffects", ], ) - -cc_library( - name = "tf_framework_dialect_registration", - srcs = ["dialect_registration.cc"], - deps = [ - ":tf_framework_ops", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc index e67b5fd7f85..b3d92773be4 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -24,8 +24,7 @@ namespace mlir { namespace kernel_gen { namespace tf_framework { -TFFrameworkDialect::TFFrameworkDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { +void TFFrameworkDialect::initialize() { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" @@ -49,19 +48,23 @@ Type TFFrameworkDialect::parseType(DialectAsmParser &parser) const { /// Print a type registered to this dialect. void TFFrameworkDialect::printType(Type type, DialectAsmPrinter &os) const { - switch (type.getKind()) { - case TFFrameworkTypes::OpKernelContextType: - os << "op_kernel_context"; - return; - default: - llvm_unreachable("unexpected TF Framework type kind"); + if (type.isa()) { + os << "op_kernel_context"; + return; } + llvm_unreachable("unexpected TF Framework type kind"); +} + +template +LogicalResult Verify(OpTy op) { + return success(); } //===----------------------------------------------------------------------===// // AllocRawOp //===----------------------------------------------------------------------===// -static LogicalResult Verify(AllocRawOp op) { +template <> +LogicalResult Verify(AllocRawOp op) { // Check that the total number of operands matches the number of dynamic // dimensions specified in the memref type. 
unsigned result_dyn_dims = op.getType().getNumDynamicDims(); @@ -74,14 +77,9 @@ static LogicalResult Verify(AllocRawOp op) { return success(); } -//===----------------------------------------------------------------------===// -// DeallocRawOp -//===----------------------------------------------------------------------===// -static LogicalResult Verify(DeallocRawOp op) { return success(); } - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" - } // namespace tf_framework } // namespace kernel_gen } // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h index 8d6e433d9b9..aab090cc5e0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -30,35 +30,20 @@ namespace mlir { namespace kernel_gen { namespace tf_framework { -namespace TFFrameworkTypes { -enum Kind { - OpKernelContextType = Type::FIRST_TF_FRAMEWORK_TYPE, -}; -} // namespace TFFrameworkTypes - /// OpKernelContextType corresponds to C++ class OpKernelContext defined in /// tensorflow/core/framework/op_kernel.h class OpKernelContextType : public Type::TypeBase { public: using Base::Base; - - static OpKernelContextType get(MLIRContext *context) { - return Base::get(context, TFFrameworkTypes::Kind::OpKernelContextType); - } - - /// Support method to enable LLVM-style type casting. - static bool kindof(unsigned kind) { - return kind == TFFrameworkTypes::Kind::OpKernelContextType; - } }; -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" - } // namespace tf_framework } // namespace kernel_gen } // namespace mlir +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td index 65481ad377f..e6e29bcbdc2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td @@ -29,7 +29,7 @@ def TFFramework_Dialect : Dialect { This dialect contains operations and types for that correspond to TensorFlow C++ Framework. 
}]; - let cppNamespace = "kernel_gen::tf_framework"; + let cppNamespace = "::mlir::kernel_gen::tf_framework"; } def TFFramework_OpKernelContextType : DialectType traits = []> : Op { - let verifier = "return Verify(*this);"; + let verifier = "return Verify<$cppClass>(*this);"; } //===----------------------------------------------------------------------===// @@ -111,4 +111,15 @@ def TFFramework_DeallocRawOp : TFFramework_Op<"dealloc_raw", let assemblyFormat = "`(` $ctx `,` $memref `)` attr-dict `:` type($memref)"; } +//===----------------------------------------------------------------------===// +// NullContextOp +//===----------------------------------------------------------------------===// +def TFFramework_NullContextOp : TFFramework_Op<"null_context", + [NoSideEffect]> { + let summary = "Creates a fake TF context that will be lowered to nullptr"; + let description = [{Needed for testing}]; + let results = (outs TFFramework_OpKernelContextType:$result); + let assemblyFormat = "`(` `)` attr-dict `:` type($result)"; +} + #endif // TF_FRAMEWORK_OPS diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc new file mode 100644 index 00000000000..68d1d581351 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -0,0 +1,243 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- kernel_creator.cc ----------------------------------------*- C++ -*-===// +// +// This file implements the function to compile a TF kernel function to gpu +// binary (hsaco for AMD, cubin for NVIDIA) or to a gpu binary with host side. 
+// +//===----------------------------------------------------------------------===// +#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" // from @llvm-project +#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project +#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project +#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project +#include "mlir/Dialect/GPU/Passes.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project +#include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project +#include "mlir/Dialect/SCF/Passes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" + +namespace tensorflow { +namespace kernel_gen { +namespace { + +using tensorflow::Status; +using xla::InternalError; +using xla::StatusOr; + +constexpr llvm::StringRef kGpuBinaryAttrName = "gpu.binary"; + +Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only, + llvm::ArrayRef tile_sizes, + llvm::ArrayRef unroll_factors) { + mlir::PassManager pm(module.getContext()); + applyPassManagerCLOptions(pm); + + pm.addPass(mlir::mhlo::createLegalizeTFPass(false)); + if (gpu_binary_only) { + pm.addNestedPass( + mlir::kernel_gen::transforms::CreateMaterializeBroadcastsPass()); + pm.addNestedPass( + mlir::kernel_gen::transforms::CreateUnfuseBatchNormPass()); + pm.addPass(mlir::mhlo::createLegalizeToLhloPass( + /*results_escape_functions=*/true)); + // Moving `AllocOp`s and inserting missing `DeallocOp`s + pm.addPass(::mlir::createBufferPlacementPass()); + pm.addNestedPass(mlir::createCopyRemovalPass()); + pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass()); + } else { + pm.addPass(mlir::createTransformUnrankedHloPass()); + 
pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass()); + pm.addPass(mlir::kernel_gen::transforms::CreateBufferizePass()); + } + + // Clean up the IR for further processing. + pm.addPass(mlir::createCanonicalizerPass()); + // We have to anticipate later unrolling in tiling to make sure that we get + // the requested tiling after unrolling. Compute the new tiling here if + // needed. + llvm::SmallVector tiling_for_unrolling; + llvm::SmallVector as_int64; + if (!unroll_factors.empty()) { + tiling_for_unrolling.reserve(tile_sizes.size()); + for (auto pair : llvm::zip(tile_sizes, unroll_factors)) { + tiling_for_unrolling.push_back(std::get<0>(pair) * std::get<1>(pair)); + as_int64.push_back(std::get<1>(pair)); + } + } else { + tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); + } + // Transform LHLO operations to LinAlg. + pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass()); + // Fuse linalg operations. + pm.addPass(::mlir::lmhlo::createLhloFuseLinalgPass( + /*use_parallel_loops=*/true, tiling_for_unrolling)); + // Transform the Linalg operations inside of the loop nest into parallel + // loops. + pm.addPass(::mlir::createConvertLinalgToParallelLoopsPass()); + // Canonicalize the code to simplify index computations. This is needed so + // that loop bounds have the same value. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Fuse the inner-most loops. + pm.addPass(xla::mlir_gpu::createFuseInnerParallelLoopsPass()); + // Run CSE to ensure that loads and stores to the same subview get + // recognized as such. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Forward stores to buffers to loads. + pm.addPass(xla::mlir_gpu::createStoreForwardingPass()); + // Remove now unused temporary buffers. + pm.addPass(xla::mlir_gpu::createDeadTempBufferRemovalPass()); + if (!unroll_factors.empty()) { + pm.addPass(::mlir::createParallelLoopTilingPass(as_int64)); + } + // Some basic cleanup. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Greedily map the remaining loop to GPU hardware dimensions. + pm.addPass(xla::mlir_gpu::createMapParallelLoopsPass()); + // Apply the mapping. + pm.addPass(mlir::createParallelLoopToGpuPass()); + + // Embed TF Framework ops. + if (!gpu_binary_only) { + pm.addPass(mlir::kernel_gen::tf_framework::CreateEmbedTFFrameworkPass()); + } + + // Some basic cleanup. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Make loops with min bounds into a conditional plus static bounds. + // Only do this if we unrolled in the first place. + if (!unroll_factors.empty()) { + pm.addNestedPass<::mlir::FuncOp>(mlir::createForLoopSpecializationPass()); + } + // Approximate Tanh using standard operations. + pm.addNestedPass<::mlir::FuncOp>( + ::mlir::mhlo::createLegalizeTanhToApproximationPass()); + // Move scalar operations into the launch to ensure smaller signatures. + pm.addPass(xla::mlir_gpu::createMoveScalarComputationsIntoGpuLaunchPass()); + // Take launches to launches with kernels. + pm.addPass(::mlir::createGpuKernelOutliningPass()); + + if (gpu_binary_only) { + // Make kernel signature deterministic so that we can call it externally. 
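To make the tile-size adjustment in the pipeline above concrete: the pass scales each requested tile size by its unroll factor so that the requested tiling is what remains after the later unrolling step. Below is a standalone sketch of just that arithmetic, with illustrative values that are not taken from the patch (the real code uses llvm::zip over the ArrayRef inputs and SmallVector outputs).

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Illustrative inputs; the defaults in the patch are tile_sizes = {16, 64}
  // and an empty unroll_factors list, which skips this scaling entirely.
  std::vector<uint32_t> tile_sizes = {16, 64};
  std::vector<uint32_t> unroll_factors = {2, 4};

  // Scale each tile size by its unroll factor (llvm::zip stops at the shorter
  // of the two lists, mirrored here with std::min).
  std::vector<uint32_t> tiling_for_unrolling;
  size_t n = std::min(tile_sizes.size(), unroll_factors.size());
  for (size_t i = 0; i < n; ++i) {
    tiling_for_unrolling.push_back(tile_sizes[i] * unroll_factors[i]);
  }

  for (uint32_t t : tiling_for_unrolling) std::cout << t << ' ';  // prints: 32 256
  std::cout << '\n';
  return 0;
}

The pipeline resumes below with the kernel-signature rewrite that the preceding comment describes.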
+ pm.addPass(xla::mlir_gpu::createRewriteKernelSignaturePass()); + } + pm.addPass(::mlir::createLowerAffinePass()); + pm.addPass(::mlir::createLowerToCFGPass()); + if (failed(pm.run(module))) { + return InternalError("Lowering to GPU kernels failed."); + } + return Status::OK(); +} + +Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only, + llvm::ArrayRef same_shape, + llvm::StringRef gpu_binary_attr_name, + int32_t architecture) { + mlir::PassManager pm(module.getContext()); + applyPassManagerCLOptions(pm); + + auto& kernel_pm = pm.nest(); + if (gpu_binary_only) { + // Grab the original signature from the single function. + auto func = *module.getBody()->op_begin(); + kernel_pm.addNestedPass( + mlir::kernel_gen::transforms::CreatePropagateTensorFlowABIKnowledgePass( + func.getType(), same_shape)); + } + kernel_pm.addPass(mlir::createStripDebugInfoPass()); + kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass( + gpu_binary_attr_name, architecture)); + + if (!gpu_binary_only) { + pm.addPass(mlir::kernel_gen::transforms::CreateTFKernelToLLVMPass()); + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); + } + return failed(pm.run(module)) ? InternalError("Lowering to LLVM IR failed.") + : Status::OK(); +} + +} // namespace + +StatusOr GenerateKernelForTfCode( + mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, + int32_t architecture, llvm::ArrayRef tile_sizes, + llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); + mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); + TF_RETURN_IF_ERROR( + LowerTFtoGPU(module.get(), gpu_binary_only, tile_sizes, unroll_factors)); +#if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA) + return InternalError( + "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined." + " Did you specify either --config=rocm or --config=cuda ?"); +#endif + +#if TENSORFLOW_USE_ROCM + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToROCDL(module.get())); +#elif GOOGLE_CUDA + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); +#endif + TF_RETURN_IF_ERROR(LowerGPUToLLVM(module.get(), gpu_binary_only, same_shape, + kGpuBinaryAttrName, architecture)); + return module; +} + +StatusOr ExtractGpuBinary(mlir::ModuleOp module) { + auto gpu_modules = module.getOps(); + if (std::distance(gpu_modules.begin(), gpu_modules.end()) != 1) { + return InternalError("There should be exactly one GPU Module"); + } + mlir::gpu::GPUModuleOp gpu_mod = *gpu_modules.begin(); + auto blob = gpu_mod.getAttrOfType(kGpuBinaryAttrName); + if (blob == nullptr) { + return InternalError("No binary blob found in the module"); + } + return blob.getValue().str(); +} + +} // namespace kernel_gen +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h similarity index 53% rename from tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h rename to tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h index 47626ba9d0d..b168ec815de 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h @@ -13,30 +13,39 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -//===- cubin_creator.h ------------------------------------------*- C++ -*-===// +//===- kernel_creator.h -----------------------------------------*- C++ -*-===// // -// This file declares the function to compile a TF kernel function to a cubin. +// This file declares the function to compile a TF kernel function to gpu +// binary (hsaco for AMD, cubin for NVIDIA) or to a gpu binary with host side. // //===----------------------------------------------------------------------===// -#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ -#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ #include -#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/xla/statusor.h" namespace tensorflow { namespace kernel_gen { -xla::StatusOr> GenerateCubinForTfCode( - llvm::StringRef tf_code, - std::pair compute_capability = {7, 5}, - llvm::ArrayRef tile_sizes = {16, 64}, + +// Converts TF code to LLVM/NVVM. If `gpu_binary_only` is true, then the +// conversion stops after gpu_binary blob is generated. If `gpu_binary_only` is +// false, lowers the host side to LLVM Dialect. +xla::StatusOr GenerateKernelForTfCode( + mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, + int32_t architecture = 75, llvm::ArrayRef tile_sizes = {16, 64}, llvm::ArrayRef same_shape = {}, llvm::ArrayRef unroll_factors = {}); + +// Extracts gpu_binary from the converted module. +xla::StatusOr ExtractGpuBinary(mlir::ModuleOp module); + } // namespace kernel_gen } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc new file mode 100644 index 00000000000..e75db59d885 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { +namespace { + +using tensorflow::Allocator; + +Allocator* GetAllocator(void* op_kernel_ctx) { + auto* ctx = static_cast(op_kernel_ctx); + // TODO(pifon): Figure out how to set AllocatorAttributes correctly. 
+ tensorflow::AllocatorAttributes attrs; + return ctx->get_allocator(attrs); +} + +} // namespace + +extern "C" void* _mlir_ciface_tf_alloc_raw(void* op_kernel_ctx, + size_t num_bytes) { + return GetAllocator(op_kernel_ctx) + ->AllocateRaw(Allocator::kAllocatorAlignment, num_bytes); +} + +extern "C" void _mlir_ciface_tf_dealloc_raw(void* op_kernel_ctx, void* ptr) { + GetAllocator(op_kernel_ctx)->DeallocateRaw(ptr); +} + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h new file mode 100644 index 00000000000..143ebc95932 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ + +#include "mlir/ExecutionEngine/RunnerUtils.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace tf_framework { + +extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_alloc_raw( + void* op_kernel_ctx, size_t num_bytes); + +extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_dealloc_raw( + void* op_kernel_ctx, void* ptr); + +} // namespace tf_framework +} // namespace kernel_gen +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TESTS_TF_FRAMEWORK_C_INTERFACE_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc similarity index 61% rename from tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc rename to tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc index 96831689600..c7cb92404f5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -//===- tf_to_cubin.cc -------------------------------------------*- C++ -*-===// +//===- tf_to_gpu_binary.cc --------------------------------------*- C++ -*-===// // -// This file implements the entry point to compile a tf op to a cubin file. 
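For readers unfamiliar with the MLIR C-interface convention, the sketch below shows how host-side code holding a real tensorflow::OpKernelContext could exercise the two entry points declared above. It is illustrative only and not part of the patch; the helper name and buffer size are made up. Generated kernels call these symbols through an opaque void* in the same way.

#include "tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h"
#include "tensorflow/core/framework/op_kernel.h"

namespace example {

// Allocates and frees a scratch buffer through the TF framework C interface,
// passing the OpKernelContext as the opaque pointer the generated code expects.
void UseScratchBuffer(tensorflow::OpKernelContext* ctx) {
  void* scratch = mlir::kernel_gen::tf_framework::_mlir_ciface_tf_alloc_raw(
      static_cast<void*>(ctx), /*num_bytes=*/1024);
  // ... fill and consume the buffer here ...
  mlir::kernel_gen::tf_framework::_mlir_ciface_tf_dealloc_raw(
      static_cast<void*>(ctx), scratch);
}

}  // namespace example

The diff continues below with the rename of tf_to_cubin.cc to tf_to_gpu_binary.cc.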
+// This file implements the entry point to compile a tf op to a gpu binary // //===----------------------------------------------------------------------===// #include @@ -23,10 +23,44 @@ #include "absl/strings/string_view.h" #include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" -#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +namespace { + +xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, + int32_t architecture, llvm::ArrayRef tile_sizes, + llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + // Read TF code. + std::string tf_code; + TF_RETURN_IF_ERROR( + ReadFileToString(Env::Default(), input_file.str(), &tf_code)); + // Compile. + mlir::MLIRContext context; + TF_ASSIGN_OR_RETURN( + mlir::OwningModuleRef module, + GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true, + architecture, tile_sizes, same_shape, + unroll_factors)); + // Extract gpu_binary. + TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module)); + + // Write gpu_binary blob. + TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), output_file.str(), gpu_binary)); + return xla::Status::OK(); +} + +} // namespace +} // namespace kernel_gen +} // namespace tensorflow int main(int argc, char** argv) { llvm::cl::opt input_file("input", llvm::cl::desc("input file"), @@ -51,38 +85,15 @@ int main(int argc, char** argv) { llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); tensorflow::InitMlir y(&argc, &argv); + mlir::registerPassManagerCLOptions(); llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); - std::pair compute_capability(architecture / 10, - architecture % 10); - - std::string tf_code; - auto read_status = tensorflow::ReadFileToString(tensorflow::Env::Default(), - input_file, &tf_code); - if (!read_status.ok()) { - LOG(ERROR) << read_status; - return 1; - } - - auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( - tf_code, compute_capability, tile_sizes, same_shape, unroll_factors); - - if (!cubin.ok()) { - LOG(ERROR) << cubin.status(); - return 1; - } - - std::vector cubin_data = cubin.ConsumeValueOrDie(); - - auto status = tensorflow::WriteStringToFile( - tensorflow::Env::Default(), output_file, - absl::string_view{reinterpret_cast(cubin_data.data()), - cubin_data.size()}); - + auto status = + tensorflow::kernel_gen::Run(input_file, output_file, architecture, + tile_sizes, same_shape, unroll_factors); if (!status.ok()) { LOG(ERROR) << status; return 1; } - return 0; } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc new file mode 100644 index 00000000000..2caa806551e --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc @@ -0,0 +1,161 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//===- tf_to_kernel.cc ------------------------------------------*- C++ -*-===// +// +// This file implements the entry point to compile a tf op to a kernel. +// +//===----------------------------------------------------------------------===// +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/LLVMIR.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +namespace { + +static llvm::codegen::RegisterCodeGenFlags CGF; + +std::unique_ptr GetTargetMachine(llvm::Module* module) { + llvm::Triple triple(module->getTargetTriple()); + if (triple.getTriple().empty()) { + triple = llvm::Triple(llvm::sys::getDefaultTargetTriple()); + module->setTargetTriple(triple.getTriple()); + } + + std::string error; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget("", triple, error); + if (!target) { + return nullptr; + } + + llvm::TargetOptions target_options = + llvm::codegen::InitTargetOptionsFromCodeGenFlags(); + return std::unique_ptr(target->createTargetMachine( + triple.str(), "generic", "", target_options, llvm::Reloc::Model::PIC_)); +} + +// Compiles the given MLIR module via LLVM into an executable binary format. +xla::StatusOr EmitToBinary(mlir::ModuleOp module) { + // Translate the module. + llvm::LLVMContext llvm_context; + std::unique_ptr llvm_module = + mlir::translateModuleToLLVMIR(module, llvm_context); + + // Set up the output stream. + llvm::SmallString<8> outstr; + llvm::raw_svector_ostream ostream(outstr); + ostream.SetUnbuffered(); + + llvm::legacy::PassManager codegen_passes; + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(llvm_module->getTargetTriple()))); + + // TODO(b/163818770): Apply optimizations before dumping .a file. + auto target_machine = GetTargetMachine(llvm_module.get()); + llvm_module->setDataLayout(target_machine->createDataLayout()); + if (target_machine->addPassesToEmitFile(codegen_passes, ostream, nullptr, + llvm::CGFT_ObjectFile, false)) { + return xla::InternalError("Failed add passes to emit file"); + } + codegen_passes.run(*llvm_module); + return ostream.str().str(); +} + +xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, + int32_t architecture, llvm::ArrayRef tile_sizes, + llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + // Read TF code. 
+ std::string tf_code; + TF_RETURN_IF_ERROR( + ReadFileToString(Env::Default(), input_file.str(), &tf_code)); + // Compile. + mlir::MLIRContext context; + TF_ASSIGN_OR_RETURN( + mlir::OwningModuleRef module, + GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false, + architecture, tile_sizes, same_shape, + unroll_factors)); + // Get binary. + TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module)); + + // Write .a file. + TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), output_file.str(), binary)); + return xla::Status::OK(); +} + +} // namespace +} // namespace kernel_gen +} // namespace tensorflow + +int main(int argc, char** argv) { + llvm::cl::opt input_file("input", llvm::cl::desc("input file"), + llvm::cl::value_desc("filename"), + llvm::cl::init("foo.mlir")); + llvm::cl::opt output_file( + "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), + llvm::cl::init("foo.bin")); + llvm::cl::opt architecture( + "arch", llvm::cl::desc("target architecture (e.g. 50 for sm_50)"), + llvm::cl::init(50)); + llvm::cl::list tile_sizes( + "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, + llvm::cl::CommaSeparated); + llvm::cl::list unroll_factors( + "unroll_factors", + llvm::cl::desc("factors to unroll by, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + llvm::cl::list same_shape( + "same_shape", + llvm::cl::desc("arguments with same shape, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + + tensorflow::InitMlir y(&argc, &argv); + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + mlir::registerPassManagerCLOptions(); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); + + auto status = + tensorflow::kernel_gen::Run(input_file, output_file, architecture, + tile_sizes, same_shape, unroll_factors); + if (!status.ok()) { + LOG(ERROR) << status; + return 1; + } + return 0; +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc index c1af35617b1..85f1fafd436 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc @@ -13,110 +13,27 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" -#include "mlir/IR/AsmState.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/FileUtilities.h" // from @llvm-project #include "mlir/Support/MlirOptMain.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" -// NOLINTNEXTLINE -static llvm::cl::opt inputFilename(llvm::cl::Positional, - llvm::cl::desc(""), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt outputFilename( - "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt splitInputFile( - "split-input-file", - llvm::cl::desc("Split the input file into pieces and process each " - "chunk independently"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyDiagnostics( - "verify-diagnostics", - llvm::cl::desc("Check that emitted diagnostics match " - "expected-* lines on the corresponding line"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyPasses( - "verify-each", - llvm::cl::desc("Run the verifier after each transformation pass"), - llvm::cl::init(true)); - -// NOLINTNEXTLINE -static llvm::cl::opt allowUnregisteredDialects( - "allow-unregistered-dialect", - llvm::cl::desc("Allow operation with no registered dialects"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt showDialects( - "show-dialects", llvm::cl::desc("Print the list of registered dialects"), - llvm::cl::init(false)); - int main(int argc, char **argv) { - mlir::registerAllDialects(); mlir::registerAllPasses(); - - mlir::mhlo::registerAllDialects(); + mlir::mhlo::registerAllMhloPasses(); + mlir::lmhlo::registerAllLmhloPasses(); mlir::kernel_gen::registerKernelGenPasses(); - llvm::InitLLVM y(argc, argv); + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + registry.insert(); - // Register any pass manager command line options. - mlir::registerAsmPrinterCLOptions(); - mlir::registerPassManagerCLOptions(); - mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run"); - - // Parse pass names in main to ensure static initialization completed. - llvm::cl::ParseCommandLineOptions(argc, argv, - "MLIR modular optimizer driver\n"); - - if (showDialects) { - mlir::MLIRContext context; - llvm::outs() << "Registered Dialects:\n"; - for (mlir::Dialect *dialect : context.getRegisteredDialects()) { - llvm::outs() << dialect->getNamespace() << "\n"; - } - return 0; - } - - // Set up the input file. 
- std::string errorMessage; - auto file = mlir::openInputFile(inputFilename, &errorMessage); - if (!file) { - llvm::errs() << errorMessage << "\n"; - return 1; - } - - auto output = mlir::openOutputFile(outputFilename, &errorMessage); - if (!output) { - llvm::errs() << errorMessage << "\n"; - exit(1); - } - - if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, - splitInputFile, verifyDiagnostics, verifyPasses, - allowUnregisteredDialects))) { - return 1; - } - // Keep the output file if the invocation of MlirOptMain was successful. - output->keep(); - return 0; + return failed( + mlir::MlirOptMain(argc, argv, "MLIR HLO pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index 0d346da9956..b853dea39d2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -1,4 +1,12 @@ load("//third_party/mlir:tblgen.bzl", "gentbl") +load( + "//tensorflow/core/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) package( default_visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen:friends"], @@ -20,6 +28,21 @@ cc_library( ], ) +cc_library( + name = "bufferize", + srcs = ["bufferize.cc"], + hdrs = ["rewriters.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + cc_library( name = "embed_tf_framework", srcs = ["embed_tf_framework.cc"], @@ -36,7 +59,7 @@ cc_library( ) gentbl( - name = "tf_framework_passes_inc_gen", + name = "kernel_gen_passes_inc_gen", tbl_outs = [("-gen-pass-decls -name KernelGen", "kernel_gen_passes.h.inc")], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes.td", @@ -46,28 +69,57 @@ gentbl( cc_library( name = "passes", srcs = [ + "bufferize_pass.cc", "embed_tf_framework_pass.cc", + "gpu_kernel_to_blob_pass.cc", + "materialize_broadcasts_pass.cc", + "propagate_tf_abi_knowledge_pass.cc", "shape_to_descriptors_pass.cc", - "tf_framework_legalize_to_llvm_pass.cc", + "tf_kernel_to_llvm_pass.cc", + "unfuse_batch_norm_pass.cc", ], hdrs = ["passes.h"], + copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]), deps = [ + "//tensorflow/compiler/mlir/hlo:materialize_broadcasts", # buildcleaner: keep + "//tensorflow/compiler/mlir/hlo:unfuse_batch_norm", # buildcleaner: keep + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core:lib", + ":bufferize", ":embed_tf_framework", + ":kernel_gen_passes_inc_gen", ":tf_framework_legalize_to_llvm", - ":tf_framework_passes_inc_gen", - "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "@llvm-project//llvm:Support", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToGPURuntimeTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", 
"@llvm-project//mlir:Shape", - "@llvm-project//mlir:ShapeToSCF", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:TargetROCDLIR", "@llvm-project//mlir:ShapeToStandard", "@llvm-project//mlir:ShapeTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - ], + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/xla/service/gpu:stream_executor_util", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_llvm", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + ] + if_cuda_is_configured([ + "//tensorflow/stream_executor/gpu:asm_compiler", + ]) + if_rocm_is_configured([ + "//tensorflow/core/platform:rocm_rocdl_path", + ]), ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc new file mode 100644 index 00000000000..45b8c524650 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc @@ -0,0 +1,188 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for translating mixed IR to buffer form. 
+ +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace kernel_gen { +namespace transforms { + +namespace { + +class TensorFromElementsOpConverter + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + TensorFromElementsOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorFromElementsOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + Location loc = op.getLoc(); + ShapedType result_type = op.getType().cast(); + int number_of_elements = op.elements().size(); + MemRefType memref_type = + MemRefType::get({number_of_elements}, result_type.getElementType()); + Value result = rewriter.create(loc, memref_type); + for (auto operand : llvm::enumerate(operands)) { + Value index = rewriter.create(loc, operand.index()); + rewriter.create(loc, operand.value(), result, index); + } + rewriter.replaceOp(op, {result}); + return success(); + } +}; + +class DynamicTensorFromElementsOpConverter + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + DynamicTensorFromElementsOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + DynamicTensorFromElementsOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + // Allocate memory on stack. + Location loc = op.getLoc(); + DynamicTensorFromElementsOp::Adaptor transformed(operands); + RankedTensorType tensor_ty = op.getType().cast(); + MemRefType memref_type = + MemRefType::get(tensor_ty.getShape(), tensor_ty.getElementType()); + Value result = rewriter.create(loc, memref_type, + transformed.dynamicExtents()); + + // Collect loop bounds. + int64_t rank = tensor_ty.getRank(); + Value zero = rewriter.create(loc, 0); + Value one = rewriter.create(loc, 1); + SmallVector lower_bounds(rank, zero); + SmallVector steps(rank, one); + SmallVector upper_bounds; + int next_dynamic_index = 0; + for (int i = 0; i < rank; i++) { + Value ub = tensor_ty.isDynamicDim(i) + ? transformed.dynamicExtents()[next_dynamic_index++] + : rewriter.create( + loc, memref_type.getDimSize(i)); + upper_bounds.push_back(ub); + } + + // Generate tensor elements. 
+ rewriter.create( + loc, lower_bounds, upper_bounds, steps, + [&](OpBuilder &b, Location loc, ValueRange ivs) { + BlockAndValueMapping mapping; + mapping.map(op.body().getArguments(), ivs); + for (auto &nested_op : op.getBody()->without_terminator()) + b.clone(nested_op, mapping); + auto yield_op = llvm::cast(op.getBody()->getTerminator()); + b.create(loc, mapping.lookup(yield_op.value()), result, ivs); + b.create(loc); + }); + + rewriter.replaceOp(op, {result}); + return success(); + } +}; + +class TensorLoadOpConversion + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + TensorLoadOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorLoadOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + TensorLoadOpAdaptor adaptor(operands); + rewriter.replaceOp(op, {adaptor.memref()}); + return success(); + } +}; + +class ExtractElementOpConversion + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + ExtractElementOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + ExtractElementOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + ExtractElementOpAdaptor adaptor(operands); + + if (!adaptor.aggregate().getType().isa()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(op, adaptor.aggregate(), + adaptor.indices()); + return success(); + } +}; + +class TensorCastOpConverter + : public BufferAssignmentOpConversionPattern { + public: + using BufferAssignmentOpConversionPattern< + TensorCastOp>::BufferAssignmentOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorCastOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto tensor_ty = op.getType().dyn_cast(); + if (!tensor_ty) return failure(); + + Value arg = operands.front(); + auto arg_ty = arg.getType().dyn_cast(); + if (!arg_ty) return failure(); + + auto result_ty = converter->convertType(tensor_ty); + rewriter.replaceOpWithNewOp(op, arg, result_ty); + + return success(); + } +}; + +} // namespace + +void populateStandardBufferizePattern(MLIRContext *context, + BufferAssignmentTypeConverter *converter, + OwningRewritePatternList *patterns) { + patterns->insert(context, converter); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc new file mode 100644 index 00000000000..8ddbb15219f --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc @@ -0,0 +1,125 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for translating mixed IR to buffer form. 
+// Currently it supports MHLO and some operations from the Standard dialect. + +#include <memory> + +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +// TODO(herhut) : This could become a real pattern in bufferize pass. What we +// would need to do is insert a copy to model the semantics correctly. The same +// is true for the TensorLoad pattern that is already in there. Then buffer +// assignment free insertion and copy removal should clean this up for us. +// +// This pattern erases the `tensor_store(src_unranked_tensor, dst_unranked_memref)` +// op and replaces the result of the op that defines `dst_unranked_memref` with +// the rewritten `src_unranked_tensor`.
+class UnrankedTensorStoreTestOnlyPattern + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mlir::TensorStoreOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + rewriter.replaceOp(op.memref().getDefiningOp(), op.tensor()); + rewriter.replaceOp(op, {}); + return success(); + } +}; + +struct BufferizePass : public BufferizePassBase { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + public: + void runOnOperation() override { + auto& context = getContext(); + ConversionTarget target(context); + target.addLegalDialect(); + target.addLegalOp(); + target.addIllegalDialect(); + target.addIllegalOp(); + target.addDynamicallyLegalOp([&](TensorStoreOp op) { + return !op.tensor().getType().isa(); + }); + + BufferAssignmentTypeConverter converter; + auto typesAreLegal = [&converter](Operation* op) { + return converter.isLegal(op->getOperandTypes()) && + converter.isLegal(op->getResultTypes()); + }; + target.addDynamicallyLegalOp([&](FuncOp op) { + auto inputs = op.getType().getInputs(); + auto results = op.getType().getResults(); + return converter.isLegal(inputs) && converter.isLegal(results) && + converter.isLegal(&op.getBody()); + }); + target.addDynamicallyLegalOp(typesAreLegal); + target.addDynamicallyLegalOp(typesAreLegal); + + OwningRewritePatternList patterns; + mhlo::populateHLOToLHLOConversionPattern(&context, &converter, &patterns); + populateWithBufferAssignmentOpConversionPatterns( + &context, &converter, &patterns); + populateStandardBufferizePattern(&context, &converter, &patterns); + patterns.insert(&context); + + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr > CreateBufferizePass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc index a0cfcae65d1..6aea4d9c619 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc @@ -36,6 +36,10 @@ static constexpr StringRef kTFEntry = "tf_entry"; // * std.dealloc becomes tf_framework.dealloc_raw. class EmbedTFFrameworkPass : public EmbedTFFrameworkPassBase { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); @@ -68,7 +72,7 @@ class EmbedTFFrameworkPass } // namespace -std::unique_ptr > createEmbedTFFrameworkPass() { +std::unique_ptr > CreateEmbedTFFrameworkPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc new file mode 100644 index 00000000000..773e12f2da3 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -0,0 +1,150 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "mlir/Target/ROCDLIR.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" + +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm_rocdl_path.h" +#endif + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +using xla::InternalError; + +class GpuKernelToBlobPass + : public GpuKernelToBlobPassBase { + public: + GpuKernelToBlobPass(mlir::StringRef blob_annotation, int32_t arch) { + blob_annotation_ = blob_annotation; + arch_ = arch; + } + + void runOnOperation() override { + mlir::gpu::GPUModuleOp gpu_module = getOperation(); + auto blob_or = GetGpuBinaryBlob(gpu_module); + if (blob_or.ok()) { + const auto& blob = blob_or.ValueOrDie(); + std::string blob_string(blob.begin(), blob.end()); + gpu_module.setAttr(blob_annotation_, + mlir::StringAttr::get(blob_string, &getContext())); + return; + } + return signalPassFailure(); + } + + xla::StatusOr> GetGpuBinaryBlob( + mlir::gpu::GPUModuleOp gpu_module) { + llvm::LLVMContext llvmContext; +#if TENSORFLOW_USE_ROCM + auto llvmModule = mlir::translateModuleToROCDLIR(gpu_module, llvmContext); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to ROCDL IR"); + } + + llvmModule->setModuleIdentifier("acme"); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + std::string libdevice_dir = tensorflow::RocdlRoot(); + + return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch_, config, + libdevice_dir); + +#elif GOOGLE_CUDA + auto llvmModule = mlir::translateModuleToNVVMIR(gpu_module, llvmContext); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to NVVM"); + } + + llvmModule->setModuleIdentifier("acme"); + llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + auto enable_fusion = [](llvm::TargetMachine* target) { + target->Options.AllowFPOpFusion = 
llvm::FPOpFusion::FPOpFusionMode::Fast; + }; + + int32_t cc_major = arch_ / 10; + int32_t cc_minor = arch_ % 10; + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); + TF_ASSIGN_OR_RETURN( + std::string ptx, + xla::gpu::nvptx::CompileToPtx(llvmModule.get(), + std::make_pair(cc_major, cc_minor), + config, libdevice_dir, enable_fusion)); + VLOG(1) << ptx; + + return tensorflow::se::CompileGpuAsm(cc_major, cc_minor, ptx.c_str(), + xla::gpu::PtxOptsFromConfig(config)); +#endif + return InternalError( + "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined." + " Did you specify either --config=rocm or --config=cuda ?"); + } + + private: + xla::StatusOr GetLibdeviceDir( + const xla::HloModuleConfig& hlo_module_config) { + for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( + hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { + std::string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + return InternalError( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); + } +}; + +} // namespace + +std::unique_ptr> CreateGpuKernelToBlobPass( + mlir::StringRef blob_annotation, int32_t architecture) { + return std::make_unique(blob_annotation, architecture); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc new file mode 100644 index 00000000000..dd3f32e2b3c --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct MaterializeBroadcastsPass + : public MaterializeBroadcastsPassBase { + void runOnFunction() override { + mlir::ConversionTarget conversionTarget(getContext()); + mlir::OwningRewritePatternList conversionPatterns; + + // Consider the mhlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
+ conversionTarget.addLegalDialect(); + + mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(), + &conversionTarget); + mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(), + &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr CreateMaterializeBroadcastsPass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h index 13f367c9fe4..179059e54eb 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -18,6 +18,8 @@ limitations under the License. #include +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -25,23 +27,41 @@ namespace mlir { namespace kernel_gen { namespace tf_framework { -// Test pass for applying TF Framework -> LLVM patterns. -std::unique_ptr > -createTestTFFrameworkLegalizeToLLVMPass(); - // Pass to replace some of the Standard ops with TF Framework ops. // * adds tf_framework::OpKernelContextType argument to the function // * std.alloc becomes tf_framework.alloc_raw // * std.dealloc becomes tf_framework.dealloc_raw -std::unique_ptr > createEmbedTFFrameworkPass(); +std::unique_ptr > CreateEmbedTFFrameworkPass(); } // namespace tf_framework namespace transforms { +// Pass for applying LLVM legalization patterns. +std::unique_ptr > CreateTFKernelToLLVMPass(); + // Pass to tranform shape computations in shape dialect to standard and scf // using memref descriptors. -std::unique_ptr CreateShapeToDescriptorsPass(); +std::unique_ptr > CreateShapeToDescriptorsPass(); + +// Pass to tranform computations on values to their corresponding parts on +// buffers. +std::unique_ptr > CreateBufferizePass(); + +// Pass to materialize broadcasts. +std::unique_ptr CreateMaterializeBroadcastsPass(); + +// Pass to propagate TF ABI knowledge, e.g. offsets, alignment. +std::unique_ptr> +CreatePropagateTensorFlowABIKnowledgePass( + mlir::FunctionType type = {}, llvm::ArrayRef same_shape = {}); + +// Pass to annotate GPU Module with its PTX. +std::unique_ptr> CreateGpuKernelToBlobPass( + mlir::StringRef blob_annotation = "", int32_t architecture = 0); + +// Pass to unfuse batch norm. +std::unique_ptr CreateUnfuseBatchNormPass(); } // namespace transforms diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td index 61720674926..5264ef3ec94 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td @@ -13,25 +13,61 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TF_FRAMEWORK_PASSES -#define TF_FRAMEWORK_PASSES +#ifndef TF_KERNEL_GEN_PASSES +#define TF_KERNEL_GEN_PASSES include "mlir/Pass/PassBase.td" -def TestTFFrameworkLegalizeToLLVMPass - : Pass<"test-tf-framework-legalize-to-llvm", "ModuleOp"> { - let summary = "Test pass for applying TF Framework -> LLVM patterns."; - let constructor = "tf_framework::createTestTFFrameworkLegalizeToLLVMPass()"; +def TFKernelToLLVMPass : Pass<"tf-kernel-to-llvm", "ModuleOp"> { + let summary = "Pass for applying LLVM legalization patterns."; + let constructor = "transforms::CreateTFKernelToLLVMPass()"; } def EmbedTFFrameworkPass : Pass<"embed-tf-framework", "ModuleOp"> { let summary = "Pass to embed TF Framework for allocation and error reporting"; - let constructor = "tf_framework::createEmbedTFFrameworkPass()"; + let constructor = "tf_framework::CreateEmbedTFFrameworkPass()"; } -def ShapeToDescriptorsPass : Pass<"test-shape-to-descriptors", "ModuleOp"> { +def ShapeToDescriptorsPass : Pass<"shape-to-descriptors", "ModuleOp"> { let summary = "Pass to transform shape computations to descriptors"; let constructor = "transforms::CreateShapeToDescriptorsPass()"; } -#endif // TF_FRAMEWORK_PASSES +def BufferizePass : Pass<"bufferize", "ModuleOp"> { + let summary = "Pass to transform operations on values to buffer based ones"; + let constructor = "transforms::CreateBufferizePass()"; +} + +def MaterializeBroadcastsPass : FunctionPass<"materialize-broadcast"> { + let summary = "Pass to materialize broadcasts"; + let constructor = "transforms::CreateMaterializeBroadcastsPass()"; +} + +def UnfuseBatchNormPass : FunctionPass<"unfuse-batch-norm"> { + let summary = "Pass to unfuse batch norm"; + let constructor = "transforms::CreateUnfuseBatchNormPass()"; +} + +def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> { + let summary = "Pass to annotate GPU Module with its PTX"; + let options = [ + Option<"blob_annotation_", "blob-annotation", "mlir::StringRef", + /*default=*/"", "Blob attribute name">, + Option<"arch_", "arch", "int32_t", /*default=*/"0", "GPU architecture">, + ]; + let constructor = "transforms::CreateGpuKernelToBlobPass()"; +} + +def PropagateTensorFlowABIKnowledgePass + : Pass<"propagate-tf-abi-knowledge", "LLVM::LLVMFuncOp"> { + let summary = "Pass to propagate TF ABI knowledge, e.g. offsets, alignment"; + let options = [ + Option<"func_type_", "func-type", "mlir::FunctionType", + /*default=*/"", "Function type">, + ListOption<"same_shape_", "same-shape", "uint32_t", + "List of same shape args">, + ]; + let constructor = "transforms::CreatePropagateTensorFlowABIKnowledgePass()"; +} + +#endif // TF_KERNEL_GEN_PASSES diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc new file mode 100644 index 00000000000..57a5fec527a --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc @@ -0,0 +1,123 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
+#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
+
+namespace mlir {
+namespace kernel_gen {
+namespace transforms {
+namespace {
+
+#define GEN_PASS_CLASSES
+#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"
+
+struct PropagateTensorFlowABIKnowledgePass
+    : public PropagateTensorFlowABIKnowledgePassBase<
+          PropagateTensorFlowABIKnowledgePass> {
+  explicit PropagateTensorFlowABIKnowledgePass(
+      mlir::FunctionType type, llvm::ArrayRef<uint32_t> same_shape) {
+    func_type_ = type;
+    same_shape_ = same_shape;
+  }
+
+  void runOnOperation() override {
+    // We know due to tensorflow ABI that the offset is always 0 and that the
+    // innermost stride is always 1. To make this visible to the compiler,
+    // we insert constants into the code and replace usages accordingly.
+    // We do not change the signature so that we keep a somewhat stable ABI
+    // that is easy to understand by tools.
+    // We also know that tensorflow aligns all allocated pointers by 16, so
+    // we pass this on. Furthermore, we know that arguments never alias. More
+    // precisely, they may only alias (due to reuse) if the kernel does not
+    // read from a position it previously has written to. We express this with
+    // the noalias attribute.
+    mlir::LLVM::LLVMFuncOp func = getOperation();
+
+    // This only works if the function is local and we can rewrite it.
+    if (func.isExternal()) return;
+
+    mlir::OpBuilder b(func.getBody());
+    // Steal the LLVM representation of the index type from the third argument.
+    auto index_type = func.getArgument(3).getType();
+    mlir::Value one = b.create<mlir::LLVM::ConstantOp>(
+        func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1));
+    mlir::Value zero = b.create<mlir::LLVM::ConstantOp>(
+        func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0));
+    uint32_t arg_pos = 0;
+    std::vector<uint32_t> positions;
+    // Collect the argument and return types of the surrounding function.
+    auto arg_types = llvm::to_vector<4>(llvm::concat<const mlir::Type>(
+        func_type_.getInputs(), func_type_.getResults()));
+    for (mlir::Type arg_type : arg_types) {
+      if (!arg_type.isa<mlir::MemRefType>()) {
+        func.emitError() << "argument of surrounding func is not ranked memref";
+        return signalPassFailure();
+      }
+      positions.push_back(arg_pos);
+      // Set alignment and aliasing on the pointers.
+      func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true));
+      func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16));
+      // Replace the offset with zero. Offset is argument number 3.
+      func.getArgument(arg_pos + 2).replaceAllUsesWith(zero);
+      // Forward over base_ptr, aligned_ptr, offset, size and stride arguments.
+      arg_pos += 3 + arg_type.cast<mlir::MemRefType>().getRank() * 2;
+      // Replace the last stride with constant 1.
+      func.getArgument(arg_pos - 1).replaceAllUsesWith(one);
+    }
+
+    // If we have knowledge that some arguments have the same shape, we
+    // can use that here. Simply replace usages of the shape parameters within
+    // the function body to a single shape parameter.
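+    // For example, if arguments #0 and #1 are rank-2 memrefs declared to have
+    // the same shape, the two sizes and two strides of #1's descriptor (the
+    // operands starting three positions into its argument group, after the two
+    // pointers and the offset) are rewired to the corresponding operands of #0.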
+ if (same_shape_.empty()) { + return; + } + auto first = same_shape_.front(); + auto first_offset = positions.at(first); + auto first_type = arg_types[first].cast(); + uint32_t rank = first_type.getRank(); + for (int i = 1, e = same_shape_.size(); i < e; ++i) { + uint32_t same = same_shape_[i]; + uint32_t same_offset = positions.at(same); + auto same_type = arg_types[same].cast(); + if (same_type.getRank() != rank) { + func.emitOpError() << "same shape constraints on arguments with " + "non-matching shapes: #" + << first << " and #" << same; + return signalPassFailure(); + } + + for (uint32_t i = 0; i < 2 * rank; ++i) { + // Replace uses for second arg data with first arg. + auto same_arg = func.getArgument(same_offset + 3 + i); + auto first_arg = func.getArgument(first_offset + 3 + i); + same_arg.replaceAllUsesWith(first_arg); + } + } + } +}; + +} // namespace + +std::unique_ptr> +CreatePropagateTensorFlowABIKnowledgePass(mlir::FunctionType type, + llvm::ArrayRef same_shape) { + return std::make_unique(type, + same_shape); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h index 257e84b4a21..0f2a41b3de6 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h @@ -20,6 +20,8 @@ limitations under the License. namespace mlir { +class BufferAssignmentPlacer; +class BufferAssignmentTypeConverter; class LLVMTypeConverter; class MLIRContext; class OwningRewritePatternList; @@ -37,6 +39,15 @@ void PopulateEmbedTFFrameworkConversionPatterns( MLIRContext *context, OwningRewritePatternList *patterns); } // namespace tf_framework + +namespace transforms { + +/// Collects a set of patterns that bufferize operations from the standard +/// dialect. +void populateStandardBufferizePattern(MLIRContext *context, + BufferAssignmentTypeConverter *converter, + OwningRewritePatternList *patterns); +} // namespace transforms } // namespace kernel_gen } // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc index 9c1b434b9b2..ab66c513e33 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc @@ -16,7 +16,6 @@ limitations under the License. // This file combines patterns for lowering shape dialect to standard ops, // structured control flow and descriptors. -#include "mlir/Conversion/ShapeToSCF/ShapeToSCF.h" // from @llvm-project #include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h" // from @llvm-project #include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project @@ -24,8 +23,8 @@ limitations under the License. 
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" namespace mlir { namespace kernel_gen { @@ -37,6 +36,10 @@ namespace { struct ShapeToDescriptorsPass : public ShapeToDescriptorsPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { MLIRContext &ctx = getContext(); @@ -51,7 +54,6 @@ struct ShapeToDescriptorsPass OwningRewritePatternList patterns; populateShapeRewritePatterns(&ctx, patterns); populateShapeToStandardConversionPatterns(patterns, &ctx); - populateShapeToSCFConversionPatterns(patterns, &ctx); // Apply conversion. auto module = getOperation(); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index 2edcaabd7b4..3ce111ff3ff 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -101,6 +101,7 @@ class AllocRawOpConverter : public ConvertToLLVMCallOpPattern { protected: StringRef GetFuncName() const override { return kCInterfaceAlloc; } + LLVMType GetFuncType() const override { LLVMType llvm_void_ptr_type = getVoidPtrType(); return LLVM::LLVMType::getFunctionTy( @@ -175,10 +176,23 @@ class DeallocRawOpConverter : public ConvertToLLVMCallOpPattern { } }; +class NullContextOpConverter : public ConvertOpToLLVMPattern { + public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, getVoidPtrType()); + return success(); + } +}; + } // namespace void PopulateTFFrameworkToLLVMConversionPatterns( LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { + patterns->insert(*converter); patterns->insert(*converter); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc similarity index 63% rename from tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc rename to tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index 916eedb55de..b2fcc424a50 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -13,25 +13,30 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" namespace mlir { namespace kernel_gen { -namespace tf_framework { +namespace transforms { namespace { #define GEN_PASS_CLASSES #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" -class TestTFFrameworkToLLVMPass - : public TestTFFrameworkLegalizeToLLVMPassBase { +class TFKernelToLLVMPass : public TFKernelToLLVMPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); @@ -39,21 +44,25 @@ class TestTFFrameworkToLLVMPass // Populate type conversions. LLVMTypeConverter type_converter(m.getContext()); type_converter.addConversion([&](tf_framework::OpKernelContextType type) { - return LLVM::LLVMType::getInt8PtrTy(type_converter.getDialect()); + return LLVM::LLVMType::getInt8PtrTy(m.getContext()); }); // Populate patterns. OwningRewritePatternList patterns; populateStdToLLVMConversionPatterns(type_converter, patterns); - PopulateTFFrameworkToLLVMConversionPatterns(&type_converter, &patterns); + tf_framework::PopulateTFFrameworkToLLVMConversionPatterns(&type_converter, + &patterns); + populateGpuToLLVMConversionPatterns(type_converter, patterns, "gpu.binary"); + lmhlo::PopulateLhloToLLVMConversionPatterns(&type_converter, &patterns); // Set target. ConversionTarget target(getContext()); target.addLegalDialect(); - target.addIllegalDialect(); - target.addLegalOp(); + target + .addIllegalDialect(); + target.addIllegalOp(); - if (failed(applyFullConversion(m, target, patterns))) { + if (failed(applyPartialConversion(m, target, patterns))) { signalPassFailure(); } } @@ -61,11 +70,10 @@ class TestTFFrameworkToLLVMPass } // namespace -std::unique_ptr > -createTestTFFrameworkLegalizeToLLVMPass() { - return std::make_unique(); +std::unique_ptr > CreateTFKernelToLLVMPass() { + return std::make_unique(); } -} // namespace tf_framework +} // namespace transforms } // namespace kernel_gen } // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc new file mode 100644 index 00000000000..d2773d91b07 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct UnfuseBatchNormPass + : public UnfuseBatchNormPassBase { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); + mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); + } +}; + +} // namespace + +std::unique_ptr CreateUnfuseBatchNormPass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/utils/array_container_utils.h b/tensorflow/compiler/mlir/utils/array_container_utils.h new file mode 100644 index 00000000000..c1a898185d9 --- /dev/null +++ b/tensorflow/compiler/mlir/utils/array_container_utils.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ + +#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::MutableArrayRef SpanToMutableArrayRef(absl::Span span) { + return llvm::MutableArrayRef(span.data(), span.size()); +} + +template +inline absl::Span ArrayRefToSpan(llvm::ArrayRef ref) { + return absl::Span(ref.data(), ref.size()); +} + +template +inline absl::Span MutableArrayRefToSpan(llvm::MutableArrayRef ref) { + return absl::Span(ref.data(), ref.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc new file mode 100644 index 00000000000..bc4e80f5aa1 --- /dev/null +++ b/tensorflow/compiler/mlir/utils/name_utils.cc @@ -0,0 +1,99 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/utils/name_utils.h" + +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "mlir/IR/Identifier.h" // from @llvm-project + +namespace mlir { + +namespace { +// Checks if a character is legal for a TensorFlow node name, with special +// handling if a character is at the beginning. +bool IsLegalChar(char c, bool first_char) { + if (isalpha(c)) return true; + if (isdigit(c)) return true; + if (c == '.') return true; + if (c == '_') return true; + + // First character of a node name can only be a letter, digit, dot or + // underscore. + if (first_char) return false; + + if (c == '/') return true; + if (c == '-') return true; + + return false; +} +} // anonymous namespace + +void LegalizeNodeName(std::string& name) { + if (name.empty()) return; + + if (!IsLegalChar(name[0], /*first_char=*/true)) name[0] = '.'; + + for (char& c : llvm::drop_begin(name, 1)) + if (!IsLegalChar(c, /*first_char=*/false)) c = '.'; +} + +std::string GetNameFromLoc(Location loc) { + llvm::SmallVector loc_names; + llvm::SmallVector locs; + locs.push_back(loc); + bool names_is_nonempty = false; + + while (!locs.empty()) { + Location curr_loc = locs.pop_back_val(); + + if (auto name_loc = curr_loc.dyn_cast()) { + // Add name in NameLoc. For NameLoc we also account for names due to ops + // in functions where the op's name is first. + auto name = name_loc.getName().strref().split('@').first; + loc_names.push_back(name); + if (!name.empty()) names_is_nonempty = true; + continue; + } else if (auto call_loc = curr_loc.dyn_cast()) { + // Add name if CallSiteLoc's callee has a NameLoc (as should be the + // case if imported with DebugInfo). + if (auto name_loc = call_loc.getCallee().dyn_cast()) { + auto name = name_loc.getName().strref().split('@').first; + loc_names.push_back(name); + if (!name.empty()) names_is_nonempty = true; + continue; + } + } else if (auto fused_loc = curr_loc.dyn_cast()) { + // Push all locations in FusedLoc in reverse order, so locations are + // visited based on order in FusedLoc. + auto reversed_fused_locs = llvm::reverse(fused_loc.getLocations()); + locs.append(reversed_fused_locs.begin(), reversed_fused_locs.end()); + continue; + } + + // Location is not a supported, so an empty StringRef is added. + loc_names.push_back(llvm::StringRef()); + } + + if (names_is_nonempty) + return llvm::join(loc_names.begin(), loc_names.end(), ";"); + + return ""; +} + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/utils/name_utils.h b/tensorflow/compiler/mlir/utils/name_utils.h new file mode 100644 index 00000000000..4b08a41feec --- /dev/null +++ b/tensorflow/compiler/mlir/utils/name_utils.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Location.h" // from @llvm-project + +namespace mlir { + +// Converts characters in name that are considered illegal in TensorFlow Node +// name to '.'. +void LegalizeNodeName(std::string& name); + +// Creates a TensorFlow node name from a location. +std::string GetNameFromLoc(Location loc); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ diff --git a/tensorflow/compiler/mlir/utils/string_container_utils.h b/tensorflow/compiler/mlir/utils/string_container_utils.h new file mode 100644 index 00000000000..fb2fa06ca4d --- /dev/null +++ b/tensorflow/compiler/mlir/utils/string_container_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" + +namespace mlir { + +inline absl::string_view StringRefToView(llvm::StringRef ref) { + return absl::string_view(ref.data(), ref.size()); +} + +inline llvm::StringRef StringViewToRef(absl::string_view view) { + return llvm::StringRef(view.data(), view.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index ada81634567..aa37181f9f0 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -15,6 +15,7 @@ package_group( "//learning/brain/experimental/mlir/...", "//learning/brain/google/xla/kernels/...", "//learning/brain/google/xla/mlir/...", + "//learning/deepmind/partir/...", "//learning/pathways/data_parallel/tf2xla/...", "//platforms/xla/...", "//tensorflow/compiler/mlir/...", @@ -55,7 +56,9 @@ cc_library( "transforms/passes.h", ], deps = [ + ":attribute_importer", ":type_to_shape", + ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:chlo_legalize_to_hlo", "//tensorflow/compiler/mlir/hlo:convert_op_folder", @@ -68,7 +71,7 @@ cc_library( "//tensorflow/compiler/xla/client/lib:conv_grad_size_util", "//tensorflow/core:framework", "//tensorflow/core/kernels:conv_grad_shape_utils", - "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:bfloat16", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:Dialect", @@ -94,6 +97,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:translate_utils", "//tensorflow/compiler/tf2xla:xla_compilation_device", "//tensorflow/compiler/tf2xla:xla_context", @@ -130,7 +134,6 @@ cc_library( ":hlo_utils", ":mlir_hlo_to_hlo", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:lhlo", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:statusor", @@ -235,8 +238,8 @@ cc_library( hdrs = ["mlir_hlo_to_hlo.h"], deps = [ ":type_to_shape", + "//tensorflow/compiler/mlir:name_utils", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/tf2xla:common", @@ -321,6 +324,16 @@ cc_library( ], ) +cc_library( + name = "translate_cl_options", + srcs = ["xla_mlir_translate_cl.cc"], + hdrs = ["xla_mlir_translate_cl.h"], + deps = [ + "@llvm-project//llvm:Support", + ], + alwayslink = 1, +) + cc_library( name = "xla_mlir_translate", srcs = ["xla_mlir_translate.cc"], @@ -329,8 +342,10 @@ cc_library( ":hlo_to_mlir_hlo", ":mhlo_to_lhlo_with_xla", ":mlir_hlo_to_hlo", + ":translate_cl_options", "//tensorflow/compiler/jit:xla_cpu_jit", "//tensorflow/compiler/jit:xla_gpu_jit", + "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", @@ -339,6 +354,7 
@@ cc_library( "//tensorflow/core:lib", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Translation", ], alwayslink = 1, @@ -385,14 +401,12 @@ cc_library( ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:chlo_legalize_to_hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:legalize_control_flow", "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", "//tensorflow/compiler/mlir/hlo:legalize_to_standard", "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo_copy_removal", "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index d366a36c212..a63fc12c285 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -521,6 +521,13 @@ StatusOr HloFunctionImporter::ImportInstruction( RandomDistributionToString(instruction->random_distribution()))); } } + case HloOpcode::kRngBitGenerator: { + auto rng_op = Cast(instruction); + auto op = func_builder->create( + loc, result_type, + func_builder->getI32IntegerAttr(rng_op->algorithm()), operands[0]); + return op.getOperation(); + } case HloOpcode::kWhile: { auto op = func_builder->create( loc, operands[0].getType(), operands[0]); diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index db981bb0227..e0cc89004cf 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/types/optional.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -62,7 +63,10 @@ class HloFunctionImporter { : context_(module.getContext()), module_(module), builder_(builder), - function_map_(function_map) {} + function_map_(function_map) { + context_->loadDialect(); + context_->loadDialect(); + } // Imports the given computation as a new function, if it hasn't been already // imported. diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc index dd045da3899..9db5861934f 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc @@ -30,6 +30,12 @@ limitations under the License. 
namespace xla { +HloModuleImporter::HloModuleImporter(mlir::ModuleOp module) + : module_(module), builder_(module.getContext()) { + module.getContext()->loadDialect(); + module.getContext()->loadDialect(); +} + Status HloModuleImporter::Import(const xla::HloModule& module) { // TODO(hinsu): Only import the entry computation here once all HLO ops with // reference to other computation are updated to have a region instead of a diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.h b/tensorflow/compiler/mlir/xla/hlo_module_importer.h index 69ac1e28219..401299484ed 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.h @@ -38,8 +38,7 @@ class Shape; // dialect. HloModuleImporter does not take ownership. class HloModuleImporter { public: - explicit HloModuleImporter(mlir::ModuleOp module) - : module_(module), builder_(module.getContext()) {} + explicit HloModuleImporter(mlir::ModuleOp module); // Import the HloModule into the MLIR Module. Status Import(const xla::HloModule& module); diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index cf78c81908d..b9d563a659d 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -22,7 +22,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -83,6 +83,9 @@ StatusOr> GetPermutationIfAvailable( strides[dim] = accumulated_stride; accumulated_stride *= shape.dimensions(dim); } + if (accumulated_stride == 0) { + return llvm::SmallVector{}; + } return llvm::SmallVector{ makeStridedLinearLayoutMap(strides, /*offset=*/0, builder.getContext())}; } diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index c94110d9102..ac5e01a0abf 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -312,6 +312,16 @@ StatusOr MlirHloBuilder::RngOpInternal( return CreateOp(op_name, shape, operands); } +StatusOr MlirHloBuilder::RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + full_result_shape, builder_)); + auto op = builder_.create( + loc_, ty, builder_.getI32IntegerAttr(algorithm), GetValue(initial_state)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) { @@ -351,6 +361,13 @@ StatusOr MlirHloBuilder::InDimBroadcast( return MakeXlaOp(op.getResult()); } +StatusOr MlirHloBuilder::AddInstruction( + HloInstructionProto&& instr, HloOpcode opcode, + absl::Span operands) { + return Unimplemented("MlirHloBuilder does not support op %s", + HloOpcodeString(opcode)); +} + StatusOr MlirHloBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) { @@ -382,6 +399,31 @@ XlaOp MlirHloBuilder::CreateToken() { }); } +StatusOr MlirHloBuilder::TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + auto op = builder_.create( + loc_, 
result_ty, GetValue(a), GetValue(b), + builder_.getBoolAttr(options.left_side()), + builder_.getBoolAttr(options.lower()), + builder_.getBoolAttr(options.unit_diagonal()), + builder_.getStringAttr( + TriangularSolveOptions::Transpose_Name(options.transpose_a()))); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + auto op = builder_.create( + loc_, result_ty, GetValue(a), builder_.getBoolAttr(lower)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::InfeedWithTokenInternal( const Shape& infeed_instruction_shape, XlaOp token, const string& config) { TF_ASSIGN_OR_RETURN(mlir::Type result_type, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index a12eb723465..00b7aa4d0b0 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -124,6 +124,13 @@ class MlirHloBuilder : public XlaBuilder { FftType fft_type, absl::Span fft_length) override; + StatusOr TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, + TriangularSolveOptions options) override; + + StatusOr CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) override; + StatusOr CustomCallInternal( const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, @@ -176,6 +183,9 @@ class MlirHloBuilder : public XlaBuilder { StatusOr RngOpInternal(RandomDistribution distribution, absl::Span parameters, const Shape& shape) override; + StatusOr RngBitGeneratorInternal(const Shape& full_result_shape, + RandomAlgorithm algorithm, + XlaOp initial_state) override; StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) override; @@ -189,6 +199,9 @@ class MlirHloBuilder : public XlaBuilder { const Shape& shape, XlaOp operand, absl::Span broadcast_dimensions) override; + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, + absl::Span operands) override; + StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) override; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index e6d0b8f8dd8..d6ef39d03dd 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/IR/UseDefLists.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/utils/name_utils.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" @@ -105,6 +106,9 @@ static mlir::LogicalResult GetXlaOp( // TODO(hpucha): This should be consolidated into a general place. static int ConvertAPInt(llvm::APInt i) { return i.getSExtValue(); } +static uint32_t Convertuint32_t(uint32_t i) { return i; } +static uint64_t Convertuint64_t(uint64_t i) { return i; } + // Convert APFloat to double. 
static double ConvertAPFloat(llvm::APFloat value) { const auto& semantics = value.getSemantics(); @@ -430,6 +434,27 @@ static xla::FrontendAttributes CreateOpFrontendAttributesFromAttribute( return frontend_attributes; } +// Returns a OpMetadata proto based on the location of the op. If the location +// is unknown, an empty proto is returned. `op_name` are populated with the op +// location (converted). FileLineColLoc locations are populated by taking the +// file name and line number, and populating `source_file` and `source_line` +// respectively. +static xla::OpMetadata CreateOpMetadataFromLocation(mlir::Operation* op) { + xla::OpMetadata metadata; + if (op->getLoc().isa()) return metadata; + + std::string name = mlir::GetNameFromLoc(op->getLoc()); + mlir::LegalizeNodeName(name); + metadata.set_op_name(name); + + if (auto file_line_col_loc = op->getLoc().dyn_cast()) { + metadata.set_source_file(file_line_col_loc.getFilename().str()); + metadata.set_source_line(file_line_col_loc.getLine()); + } + + return metadata; +} + // Checks if all shardings are set. static bool AllOptionalShardingsAreSet( llvm::ArrayRef> shardings) { @@ -761,7 +786,7 @@ LogicalResult ExportXlaOp(InfeedOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(IotaOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; value_map[op] = xla::Iota(ctx.builder, xla::TypeToShape(op.getType()), - op.iota_dimension().getSExtValue()); + op.iota_dimension()); return success(); } @@ -882,6 +907,17 @@ LogicalResult ExportXlaOp(ReturnOp op, OpLoweringContext ctx) { return failure(); } +LogicalResult ExportXlaOp(RngBitGeneratorOp op, OpLoweringContext ctx) { + auto& value_map = *ctx.values; + auto result = op.getResult(); + auto xla_arg_1 = value_map[*op.getODSOperands(0).begin()]; + auto xla_result = xla::RngBitGenerator( + static_cast(op.rng_algorithm()), Unwrap(xla_arg_1), + xla::TypeToShape(result.getType()).tuple_shapes(1)); + value_map[result] = xla_result; + return mlir::success(); +} + LogicalResult ExportXlaOp(RngNormalOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; xla::XlaOp mu, sigma; @@ -974,7 +1010,7 @@ LogicalResult ExportXlaOp(SortOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; value_map[op] = xla::Sort(GetTuple(op.operands(), ctx), comparator, - op.dimension().getSExtValue(), op.is_stable()); + op.dimension(), op.is_stable()); return success(); } diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 407a7d3da38..801c04496f0 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -165,6 +165,11 @@ static bool OperatorWritersMain(raw_ostream& os, RecordKeeper& records) { "frontend_attributes(lowering_context.builder, " "CreateOpFrontendAttributesFromAttribute(op));\n\n"; + // Create a scoped object to assign op metadata to generated XLA ops. + os << " xla::XlaScopedOpMetadataAssignment " + "op_metadata(lowering_context.builder, " + "CreateOpMetadataFromLocation(op));\n\n"; + // Retrieve all the definitions derived from HLO_Op and sort by record name. for (const auto* def : records.getAllDerivedDefinitions("HLO_Op")) { // Skip operations that have a custom exporter. 
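The two changes above add a CreateOpMetadataFromLocation helper to mlir_hlo_to_hlo.cc and make the auto-generated exporters install a scoped xla::XlaScopedOpMetadataAssignment, so every emitted XLA op carries metadata derived from its MLIR location. A minimal sketch of the expected mapping follows; it is an editor's illustration rather than part of the patch, the xla_data.pb.h include path is assumed, and the name, file, and line values are hypothetical.

#include <string>

#include "tensorflow/compiler/xla/xla_data.pb.h"  // assumed header for xla::OpMetadata

// Mirrors the behavior of CreateOpMetadataFromLocation above:
//  * a NameLoc such as "add_1@some_func" yields op_name = "add_1" (the part
//    before '@', with illegal characters rewritten to '.' by LegalizeNodeName);
//  * a top-level FileLineColLoc yields source_file and source_line instead;
//  * an UnknownLoc yields an empty proto.
xla::OpMetadata ExampleMetadataForNameLoc() {
  xla::OpMetadata metadata;
  metadata.set_op_name("add_1");  // hypothetical node name
  return metadata;
}

xla::OpMetadata ExampleMetadataForFileLineColLoc() {
  xla::OpMetadata metadata;
  metadata.set_source_file("model.py");  // hypothetical source file
  metadata.set_source_line(42);          // hypothetical line number
  return metadata;
}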
diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt index 3630d2d45e4..a83e36cff64 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt @@ -8,6 +8,6 @@ HloModule TestModule ENTRY TestComputation { x = f32[3, 2]{1,0} parameter(0) - // CHECK: "lmhlo.copy"(%{{.*}}, %{{.*}}) : (memref<3x2xf32>, memref<3x2xf32, #[[MAP]]>) -> () + // CHECK: "lmhlo.copy"(%{{.*}}, %{{.*}}) {name = "copy.1"} : (memref<3x2xf32>, memref<3x2xf32, #[[MAP]]>) -> () ROOT x.copy = f32[3, 2]{0,1} copy(x) } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index 69eaeeb946d..5a07d9303f0 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -17,9 +17,7 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> // CHECK: [[LHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[LHSBCASTSHAPE]] // CHECK: [[LHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> // CHECK: [[RHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[RHSTAIL]] -// CHECK: [[RHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[RHSBCASTSHAPE]] -// CHECK: [[RHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[RHS]], [[RHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, tensor<3xindex>) -> tensor<3x2x4xf32> -// CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHSBCAST]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> +// CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHS]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> // CHECK: return [[RESULT]] : tensor<3x4x4xf32> // CHECK: } @@ -29,7 +27,6 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_lhs_batch -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, @@ -43,7 +40,6 @@ func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_rhs_batch // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) 
{broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, @@ -64,20 +60,20 @@ func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) return %0 : tensor } -func @batchmatmulv2_adj_real(%arg0: tensor<5x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<5x4xf32> { +func @batchmatmulv2_adj_real(%arg0: tensor<2x5xf32>, %arg1: tensor<4x2xf32>) -> tensor<5x4xf32> { // CHECK-LABEL: func @batchmatmulv2_adj_real // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<> : tensor<0xi64>, // CHECK-SAME: lhs_contracting_dimensions = dense<0> : tensor<1xi64>, // CHECK-SAME: rhs_batching_dimensions = dense<> : tensor<0xi64>, // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xf32>, tensor<2x4xf32>) -> tensor<5x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<2x5xf32>, tensor<4x2xf32>) -> tensor<5x4xf32> return %0 : tensor<5x4xf32> } -func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +func @batchmatmulv2_adj_complex(%arg0: tensor<2x5xcomplex>, %arg1: tensor<4x2xcomplex>) -> tensor<5x4xcomplex> { // CHECK-LABEL: func @batchmatmulv2_adj_complex( -// CHECK-SAME: [[LHS:%.*]]: tensor<5x2xcomplex>, [[RHS:%.*]]: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +// CHECK-SAME: [[LHS:%.*]]: tensor<2x5xcomplex>, [[RHS:%.*]]: tensor<4x2xcomplex>) -> tensor<5x4xcomplex> { // CHECK: [[LHSRE:%.*]] = "mhlo.real"([[LHS]]) // CHECK: [[LHSIM:%.*]] = "mhlo.imag"([[LHS]]) // CHECK: [[LHSIMNEG:%.*]] = "mhlo.negate"([[LHSIM]]) @@ -88,6 +84,6 @@ func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2 // CHECK: [[RHSCONJ:%.*]] = "mhlo.complex"([[RHSRE]], [[RHSIMNEG]]) // CHECK: shape.shape_of [[LHSCONJ]] // CHECK: shape.shape_of [[RHSCONJ]] - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<2x5xcomplex>, tensor<4x2xcomplex>) -> tensor<5x4xcomplex> return %0 : tensor<5x4xcomplex> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir index fd9c14c7c0f..887fdea5a21 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -2,6 +2,7 @@ // (unlike the rest), since this is the primary use case for such ops and // verification of shapes and broadcasts is desired. // RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -canonicalize %s | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FileCheck --check-prefix CHLO %s //===----------------------------------------------------------------------===// // Binary op legalizations. 
@@ -48,8 +49,8 @@ func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor, tensor -> tensor + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<2xindex> // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-NEXT: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} // CHECK-NEXT: %[[RESULT:.+]] = mhlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] : tensor @@ -58,6 +59,15 @@ func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor } +// CHECK-LABEL: func @broadcast_add_unranked +// CHLO-LABEL: func @broadcast_add_unranked +func @broadcast_add_unranked(%arg0: tensor<1xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { + // CHECK: tf.Add + // CHLO: chlo.broadcast_add %arg0, %arg1 + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<*xi32>) -> tensor<*xi32> + return %0: tensor<*xi32> +} + // CHECK-LABEL: func @div func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> @@ -139,9 +149,9 @@ func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8 } // CHECK-LABEL: func @and -func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { +func @and(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK-NEXT: mhlo.and - %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -153,9 +163,9 @@ func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { } // CHECK-LABEL: func @or -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { +func @or(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK-NEXT: mhlo.or - %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -201,8 +211,8 @@ func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor // NOT-CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[LHS_SHAPE]], %[[RHS_SHAPE]] // NOT-CHECK-NEXT: shape.assuming %[[WITNESS]] -> (tensor) { // NOT-CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 - // NOT-CHECK-NEXT: %[[RESULT_SHAPE:.+]] = shape.broadcast %[[LHS_SHAPE1]], %[[RHS_SHAPE]] - // NOT-CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.to_extent_tensor %[[RESULT_SHAPE]] + // NOT-CHECK-NEXT: %[[RESULT_SHAPE:.+]] = shape.broadcast %[[LHS_SHAPE1]], %[[RHS_SHAPE]] : tensor, tensor -> tensor + // NOT-CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<1xindex> // NOT-CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // NOT-CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // NOT-CHECK-NEXT: %[[RESULT:.+]] = "mhlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} @@ -290,8 +300,8 @@ func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor, tensor -> tensor + // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor_cast %[[RESULT_SHAPE]] : tensor to tensor<1xindex> // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK-DAG: 
%[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} // CHECK-NEXT: "mhlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir index f84a2f28a23..876a1bf03e7 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir @@ -169,7 +169,7 @@ func @send_to_host(%arg0: tensor) { // CHECK: "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} // CHECK-SAME: is_host_transfer = true - // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key"} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key_dtoh_0"} // CHECK-SAME: (tensor, !mhlo.token) -> !mhlo.token "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor) -> () return @@ -186,7 +186,7 @@ func @recv_from_host() -> tensor { // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[INIT_TOKEN]]) // CHECK-SAME: channel_id = {handle = 1 : i64, type = 3 : i64} // CHECK-SAME: is_host_transfer = true - // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv_key"} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv_key_htod_0"} // CHECK-SAME: (!mhlo.token) -> tuple, !mhlo.token> @@ -407,6 +407,694 @@ func @callee2() attributes {sym_visibility = "private"} { // ----- +// Test cloned function rewrite also checks transitive function calls to +// TF/XLA communication ops. + +// CHECK: func @callee3() +func @callee3() { + // CHECK: [[CALLEE3_INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: call @callee4{{.+}}([[CALLEE3_INIT_TOKEN]]) + call @callee4() : () -> () + return +} + +// CHECK: func @callee4() +func @callee4() { + // CHECK: [[CALLEE4_INIT_TOKEN:%.*]] = "mhlo.create_token" + + // CHECK: [[CALL_5:%.*]] = call @callee5([[CALLEE4_INIT_TOKEN]]) + call @callee5() : () -> () + + // CHECK: return + return +} + +// CHECK: func @callee5([[CALLEE5_ARG0:%.*]]: !mhlo.token) -> !mhlo.token +func @callee5() attributes {sym_visibility = "private"} { + // CHECK-NOT: "mhlo.create_token" + + // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[CALLEE5_ARG0]]) + // CHECK: [[RECV_VAL:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: [[RECV_TOKEN:%.*]] = "mhlo.get_tuple_element"([[RECV_TUPLE]]) + // CHECK-SAME: index = 1 + %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf.shape<>} : () -> tensor + + // CHECK: return [[RECV_TOKEN]] + return +} + +// CHECK: func @callee4{{.+}}([[CALLEE4_ARG0:%.*]]: !mhlo.token) -> !mhlo.token attributes {sym_visibility = "private"} +// CHECK-NOT: "mhlo.create_token" +// CHECK: [[CALL_5:%.*]] = call @callee5([[CALLEE4_ARG0]]) +// CHECK: return [[CALL_5]] + +// ----- + +// Tests `mhlo.if` with branches populated with TF/XLA communication ops. 
+ +// CHECK-LABEL: func @if_both_branches +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_both_branches(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[TRUE_SEND_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_VALUE]], [[TRUE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} + + // CHECK: [[TRUE_RECV_TUPLE:%.*]] = "mhlo.recv"([[TRUE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_true", send_key = "send_if_true", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[FALSE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + + // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[FALSE_REGION_ARG_VALUE]], [[FALSE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} + + // CHECK: [[FALSE_RECV_TUPLE:%.*]] = "mhlo.recv"([[FALSE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_false", send_key = "send_if_false", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, 
tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with only the `true` branch populated with TF/XLA +// communication ops. + +// CHECK-LABEL: func @if_true_branch +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_true_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[TRUE_SEND_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_VALUE]], [[TRUE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} + + // CHECK: [[TRUE_RECV_TUPLE:%.*]] = "mhlo.recv"([[TRUE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_true", send_key = "send_if_true", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_RECV_TUPLE]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%arg3) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with only the `false` branch populated with TF/XLA +// communication ops. 
+ +// CHECK-LABEL: func @if_false_branch +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_false_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[TRUE_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + // CHECK: [[FALSE_TUPLE:%.*]] = "mhlo.tuple"([[ARG2]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[TRUE_TUPLE]], [[FALSE_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg2) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_GET_TUPLE_ELEMENT0]], [[TRUE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg3) : (tensor) -> () + }, { + // CHECK: ^bb0([[FALSE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[FALSE_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[FALSE_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[FALSE_REGION_ARG]]) {index = 1 + + // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[FALSE_REGION_ARG_VALUE]], [[FALSE_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} + + // CHECK: [[FALSE_RECV_TUPLE:%.*]] = "mhlo.recv"([[FALSE_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg3) {recv_key = "recv_if_false", send_key = "send_if_false", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[FALSE_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[FALSE_RECV_TUPLE]]) {index = 1 + // CHECK: [[FALSE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[FALSE_GET_TUPLE_ELEMENT0]], [[FALSE_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[FALSE_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + + // CHECK: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[IF_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` with tuple arg from a `mhlo.tuple` only used by `mhlo.if` is +// replaced. 
+ +// CHECK-LABEL: func @if_replace_tuple_arg +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor) +func @if_replace_tuple_arg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK-NOT: "mhlo.tuple"([[ARG1]], [[ARG2]]) + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[ARG2]], [[INIT_TOKEN]]) + %0 = "mhlo.tuple"(%arg1, %arg2) : (tensor, tensor) -> tuple, tensor> + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[IF_ARG_TUPLE]], [[IF_ARG_TUPLE]]) + %1 = "mhlo.if"(%arg0, %0, %0) ( { + ^bb0(%arg3: tuple, tensor>): + %2 = "mhlo.get_tuple_element"(%arg3) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%2) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%2) : (tensor) -> () + }, { + ^bb0(%arg3: tuple, tensor>): + %2 = "mhlo.get_tuple_element"(%arg3) {index = 0 : i32} : (tuple, tensor>) -> tensor + "mhlo.return"(%2) : (tensor) -> () + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tensor + return %1 : tensor +} + +// ----- + +// Tests `mhlo.if` with tuple arg not from a `mhlo.tuple` is unpacked. + +// CHECK-LABEL: func @if_unpack_tuple_arg +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tuple, tensor>) +func @if_unpack_tuple_arg(%arg0: tensor, %arg1: tuple, tensor>) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK-DAG: [[IF_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[ARG1]]) {index = 0 + // CHECK-DAG: [[IF_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[ARG1]]) {index = 1 + // CHECK: [[IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[IF_ARG_ELEMENT0]], [[IF_ARG_ELEMENT1]], [[INIT_TOKEN]]) + + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if"([[ARG0]], [[IF_ARG_TUPLE]], [[IF_ARG_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%1) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "mhlo.return"(%1) : (tensor) -> () + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tensor + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` tuple result is extended with a `mhlo.token`. + +// CHECK-LABEL: func @if_extend_tuple_result +func @if_extend_tuple_result(%arg0: tensor, %arg1: tuple, tensor>) -> tuple, tensor> { + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tuple, tensor>): + %1 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple, tensor>) -> tensor + "tf.XlaSendToHost"(%1) {key = "send_key"} : (tensor) -> () + "mhlo.return"(%arg2) : (tuple, tensor>) -> () + }, { + ^bb0(%arg2: tuple, tensor>): + "mhlo.return"(%arg2) : (tuple, tensor>) -> () + // CHECK: (tensor, tuple, tensor, !mhlo.token>, tuple, tensor, !mhlo.token>) -> tuple, tensor, !mhlo.token> + }) : (tensor, tuple, tensor>, tuple, tensor>) -> tuple, tensor> + + // CHECK-DAG: [[IF_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 0 + // CHECK-DAG: [[IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 1 + // CHECK: [[IF_SUBTUPLE_RESULT:%.*]] = "mhlo.tuple"([[IF_TUPLE_ELEMENT0]], [[IF_TUPLE_ELEMENT1]]) + // CHECK: return [[IF_SUBTUPLE_RESULT]] + return %0 : tuple, tensor> +} + +// ----- + +// Tests nested `mhlo.if` containing TF/XLA communication ops. 
+ +// CHECK-LABEL: func @if_nested +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor) +func @if_nested(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[OUTER_IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG1]], [[INIT_TOKEN]]) + + // CHECK: "mhlo.if"([[ARG0]], [[OUTER_IF_ARG_TUPLE]], [[OUTER_IF_ARG_TUPLE]]) + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK-NEXT: ^bb0([[OUTER_IF_TRUE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[OUTER_IF_TRUE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_TRUE_ARG]]) {index = 0 + // CHECK-DAG: [[OUTER_IF_TRUE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_TRUE_ARG]]) {index = 1 + // CHECK: [[INNER_IF_ARG_TUPLE:%.*]] = "mhlo.tuple"([[OUTER_IF_TRUE_ARG_ELEMENT0]], [[OUTER_IF_TRUE_ARG_ELEMENT1]]) + + %1 = mhlo.constant dense : tensor + + // CHECK: [[INNER_IF_TUPLE:%.*]] = "mhlo.if"({{%.*}}, [[INNER_IF_ARG_TUPLE]], [[INNER_IF_ARG_TUPLE]]) + %2 = "mhlo.if"(%1, %arg2, %arg2) ( { + // CHECK-NEXT: ^bb0([[INNER_IF_TRUE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[INNER_IF_TRUE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TRUE_ARG]]) {index = 0 + // CHECK-DAG: [[INNER_IF_TRUE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TRUE_ARG]]) {index = 1 + + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send"([[INNER_IF_TRUE_ARG_ELEMENT0]], [[INNER_IF_TRUE_ARG_ELEMENT1]]) + "tf.XlaSendToHost"(%arg3) {key = "send_key"} : (tensor) -> () + + // CHECK: [[INNER_IF_TRUE_RESULT:%.*]] = "mhlo.tuple"([[INNER_IF_TRUE_ARG_ELEMENT0]], [[SEND_TOKEN]]) + // CHECK: "mhlo.return"([[INNER_IF_TRUE_RESULT]]) + "mhlo.return"(%arg3) : (tensor) -> () + + // CHECK-NEXT: }, { + }, { + + // CHECK-NEXT: ^bb0([[INNER_IF_FALSE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg3: tensor): + // CHECK-DAG: [[INNER_IF_FALSE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_FALSE_ARG]]) {index = 0 + // CHECK-DAG: [[INNER_IF_FALSE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_FALSE_ARG]]) {index = 1 + // CHECK: [[INNER_IF_FALSE_RESULT:%.*]] = "mhlo.tuple"([[INNER_IF_FALSE_ARG_ELEMENT0]], [[INNER_IF_FALSE_ARG_ELEMENT1]]) + // CHECK: "mhlo.return"([[INNER_IF_FALSE_RESULT]]) + "mhlo.return"(%arg3) : (tensor) -> () + // CHECK-NEXT: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + + // CHECK-DAG: [[INNER_IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[INNER_IF_TUPLE]]) {index = 1 + // CHECK: [[OUTER_IF_TRUE_RESULT:%.*]] = "mhlo.tuple"([[OUTER_IF_TRUE_ARG_ELEMENT0]], [[INNER_IF_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[OUTER_IF_TRUE_RESULT]]) + "mhlo.return"(%arg2) : (tensor) -> () + + // CHECK-NEXT: }, { + }, { + + // CHECK-NEXT: ^bb0([[OUTER_IF_FALSE_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[OUTER_IF_FALSE_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_FALSE_ARG]]) {index = 0 + // CHECK-DAG: [[OUTER_IF_FALSE_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[OUTER_IF_FALSE_ARG]]) {index = 1 + // CHECK: [[OUTER_IF_FALSE_RESULT:%.*]] = "mhlo.tuple"([[OUTER_IF_FALSE_ARG_ELEMENT0]], [[OUTER_IF_FALSE_ARG_ELEMENT1]]) + // CHECK: "mhlo.return"([[OUTER_IF_FALSE_RESULT]]) + "mhlo.return"(%arg2) : (tensor) -> () + // CHECK-NEXT: (tensor, tuple, !mhlo.token>, tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// Tests `mhlo.if` containing a function call to TF/XLA 
communication ops. + +// CHECK-LABEL: func @if_function_call +func @if_function_call(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK-DAG: [[TRUE_REGION_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[TRUE_REGION_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + // CHECK: [[CALL_TOKEN:%.*]] = call @callee([[TRUE_REGION_ARG_ELEMENT0]], [[TRUE_REGION_ARG_ELEMENT1]]) + call @callee(%arg2) : (tensor) -> () + + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_REGION_ARG_ELEMENT0]], [[CALL_TOKEN]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @callee +// CHECK-SAME: ([[CALLEE_ARG0:%.*]]: tensor, [[CALLEE_ARG1:%.*]]: !mhlo.token) -> !mhlo.token +func @callee(%arg0: tensor) attributes {sym_visibility = "private"} { + // CHECK: [[SEND_TOKEN:%.*]] = "mhlo.send" + "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor) -> () + + // CHECK: return [[SEND_TOKEN]] + return +} + +// ----- + +// Tests `mhlo.if` containing multiple TF/XLA communication ops. + +// CHECK-LABEL: func @if_region_multiple_ops +func @if_region_multiple_ops(%arg0: tensor, %arg1: tensor) { + // CHECK: "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + // CHECK: ^bb0([[TRUE_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg2: tensor): + // CHECK: [[TRUE_REGION_ARG_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 0 + // CHECK: [[TRUE_REGION_ARG_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[TRUE_REGION_ARG]]) {index = 1 + + // CHECK: [[SEND0_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_ELEMENT0]], [[TRUE_REGION_ARG_ELEMENT1]]) + "tf.XlaSendToHost"(%arg2) {key = "send_key0"} : (tensor) -> () + + // CHECK: [[SEND1_TOKEN:%.*]] = "mhlo.send"([[TRUE_REGION_ARG_ELEMENT0]], [[SEND0_TOKEN]]) + "tf.XlaSendToHost"(%arg2) {key = "send_key1"} : (tensor) -> () + + // CHECK: [[TRUE_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[TRUE_REGION_ARG_ELEMENT0]], [[SEND1_TOKEN]]) + // CHECK: "mhlo.return"([[TRUE_RETURN_TUPLE]]) + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + return +} + +// ----- + +// Tests `mhlo.if` containing TF/XLA communication ops followed by other TF/XLA +// communication ops. + +func @if_followed_by_communication_op(%arg0: tensor, %arg1: tensor) { + // CHECK: [[IF_TUPLE:%.*]] = "mhlo.if" + %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( { + ^bb0(%arg2: tensor): + "tf.XlaSendToHost"(%arg2) {key = "send_key0"} : (tensor) -> () + "mhlo.return"(%arg2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor): + "mhlo.return"(%arg2) : (tensor) -> () + }) : (tensor, tensor, tensor) -> tensor + + // CHECK: [[IF_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[IF_TUPLE]]) {index = 1 + + // CHECK: "mhlo.send"({{.*}}, [[IF_TUPLE_ELEMENT1]]) + "tf.XlaSendToHost"(%arg1) {key = "send_key1"} : (tensor) -> () + return +} + +// ----- + +// Tests `mhlo.while` with cond and body populated with TF/XLA communication +// ops. 
+ +// CHECK-LABEL: func @while_cond_body +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_cond_body(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[COND_REGION_ARG_VALUE]], [[COND_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} + + // CHECK: [[COND_RECV_TUPLE:%.*]] = "mhlo.recv"([[COND_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%1, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + + // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[BODY_REGION_ARG_VALUE]], [[BODY_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 3 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} + + // CHECK: [[BODY_RECV_TUPLE:%.*]] = "mhlo.recv"([[BODY_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 4 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: 
return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` with only the `cond` region populated with TF/XLA +// communication ops. + +// CHECK-LABEL: func @while_cond +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_cond(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[COND_REGION_ARG_VALUE]], [[COND_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} + + // CHECK: [[COND_RECV_TUPLE:%.*]] = "mhlo.recv"([[COND_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_RECV_TUPLE]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%1, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%arg1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` with only the `body` region populated with TF/XLA +// communication ops. 
+ +// CHECK-LABEL: func @while_body +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @while_body(%arg0: tensor) -> tensor { + // CHECK: [[INIT_TOKEN:%.*]] = "mhlo.create_token" + // CHECK: [[ARG_TUPLE:%.*]] = "mhlo.tuple"([[ARG0]], [[INIT_TOKEN]]) + + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while"([[ARG_TUPLE]]) + %0 = "mhlo.while"(%arg0) ( { + // CHECK: ^bb0([[COND_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[COND_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[COND_REGION_ARG]]) {index = 1 + + // CHECK: [[COND_COMPARE:%.*]] = "mhlo.compare"([[COND_GET_TUPLE_ELEMENT0]], [[COND_GET_TUPLE_ELEMENT0]]) + %2 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + + // CHECK: "mhlo.return"([[COND_COMPARE]]) + "mhlo.return"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_REGION_ARG:%.*]]: tuple, !mhlo.token>): + ^bb0(%arg1: tensor): + // CHECK-DAG: [[BODY_REGION_ARG_VALUE:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 0 + // CHECK-DAG: [[BODY_REGION_ARG_TOKEN:%.*]] = "mhlo.get_tuple_element"([[BODY_REGION_ARG]]) {index = 1 + + // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[BODY_REGION_ARG_VALUE]], [[BODY_REGION_ARG_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} + + // CHECK: [[BODY_RECV_TUPLE:%.*]] = "mhlo.recv"([[BODY_SEND_TOKEN]]) + // CHECK-SAME: channel_id = {handle = 2 : i64, type = 3 : i64} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "f32", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} + %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", tpu_core = 0 : i64} : (tensor) -> tensor + + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 0 + // CHECK-DAG: [[BODY_GET_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[BODY_RECV_TUPLE]]) {index = 1 + // CHECK: [[BODY_RETURN_TUPLE:%.*]] = "mhlo.tuple"([[BODY_GET_TUPLE_ELEMENT0]], [[BODY_GET_TUPLE_ELEMENT1]]) + // CHECK: "mhlo.return"([[BODY_RETURN_TUPLE]]) + "mhlo.return"(%1) : (tensor) -> () + // CHECK: (tuple, !mhlo.token>) -> tuple, !mhlo.token> + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT0:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) + // CHECK-SAME: index = 0 + // CHECK: return [[WHILE_TUPLE_ELEMENT0]] + return %0 : tensor +} + +// ----- + +// Tests `mhlo.while` containing TF/XLA communication ops followed by other +// TF/XLA communication ops. + +func @while_followed_by_communication_op(%arg0: tensor) { + // CHECK: [[WHILE_TUPLE:%.*]] = "mhlo.while" + %0 = "mhlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + "tf.XlaSendToHost"(%arg1) {key = "send_key0"} : (tensor) -> () + %1 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "mhlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) : (tensor) -> tensor + + // CHECK: [[WHILE_TUPLE_ELEMENT1:%.*]] = "mhlo.get_tuple_element"([[WHILE_TUPLE]]) {index = 1 + + // CHECK: "mhlo.send"({{.*}}, [[WHILE_TUPLE_ELEMENT1]]) + "tf.XlaSendToHost"(%arg0) {key = "send_key1"} : (tensor) -> () + return +} + +// ----- + +// Tests unsupported parent of TF/XLA communication op. 
+
+func @unsupported_ancestor(%arg0: tensor, %arg1: tensor) {
+  %0 = "mhlo.reduce"(%arg0, %arg1) ( {
+  ^bb0(%arg2: tensor, %arg3: tensor):
+    %1 = mhlo.add %arg2, %arg3 : tensor
+    // expected-error@+1 {{expects ancestor(s) to be of ['mhlo.if', 'func']}}
+    "tf._XlaHostComputeMlir"() {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : () -> ()
+    "mhlo.return"(%1) : (tensor) -> ()
+  }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor, tensor) -> tensor
+  return
+}
+
+// -----
+
+// Tests transitive unsupported parent of TF/XLA communication op.
+
+func @unsupported_ancestor(%arg0: tensor, %arg1: tensor) {
+  %0 = "mhlo.reduce"(%arg0, %arg1) ( {
+  ^bb0(%arg2: tensor, %arg3: tensor):
+    %1 = mhlo.add %arg2, %arg3 : tensor
+    // expected-error@+1 {{expects ancestor(s) to be of ['mhlo.if', 'func']}}
+    call @callee() : () -> ()
+    "mhlo.return"(%1) : (tensor) -> ()
+  }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor, tensor) -> tensor
+  return
+}
+
+func @callee() attributes {sym_visibility = "private"} {
+  "tf._XlaHostComputeMlir"() {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", tpu_core = 0 : i64} : () -> ()
+  return
+}
+
+// -----
+
+// Tests unsupported `mhlo.if` with region of more than one block and contains a
+// TF/XLA communication op.
+
+func @if_multiple_blocks(%arg0: tensor, %arg1: tensor) {
+  %0 = "mhlo.if"(%arg0, %arg1, %arg1) ( {
+  ^bb0(%arg2: tensor):
+    br ^bb1(%arg2 : tensor)
+  ^bb1(%arg3: tensor):
+    // expected-error@+1 {{expects single block region ancestor(s)}}
+    "tf.XlaSendToHost"(%arg3) {key = "send_key0"} : (tensor) -> ()
+    "mhlo.return"(%arg3) : (tensor) -> ()
+  }, {
+  ^bb0(%arg2: tensor):
+    "mhlo.return"(%arg2) : (tensor) -> ()
+  }) : (tensor, tensor, tensor) -> tensor
+  return
+}
+
+// -----
+
 // Tests function with more than one block that is to be rewritten emits an
 // error instead.
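Editor's note (not part of the patch): the legalize-tf-communication tests above all exercise the same rewrite. The pass threads an `!mhlo.token` through functions and through `mhlo.if`/`mhlo.while` regions, lowers `tf.XlaSendToHost`, `tf.XlaRecvFromHost`, and `tf._XlaHostComputeMlir` to `mhlo.send`/`mhlo.recv`, and suffixes the rendezvous keys with `_dtoh_0` (device-to-host) or `_htod_0` (host-to-device). A minimal sketch of the shape of that rewrite, inferred from the CHECK patterns above; the element type and channel handle values are illustrative, not taken from the patch:

  // Input: a host transfer expressed with a TF op.
  func @send_to_host(%arg0: tensor<i32>) {
    "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor<i32>) -> ()
    return
  }

  // Roughly what the pass produces: a token is created and threaded through the
  // newly introduced mhlo.send, and the key gains a `_dtoh_0` suffix.
  func @send_to_host(%arg0: tensor<i32>) {
    %token = "mhlo.create_token"() : () -> !mhlo.token
    %done = "mhlo.send"(%arg0, %token) {
      channel_id = {handle = 1 : i64, type = 2 : i64},
      is_host_transfer = true,
      mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32",
                                  _xla_host_transfer_rendezvous = "send_key_dtoh_0"}
    } : (tensor<i32>, !mhlo.token) -> !mhlo.token
    return
  }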
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir
index 5a9089756a9..93eac3821b2 100644
--- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir
@@ -44,7 +44,7 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} {
 // CHECK-LABEL: func @case
 // CHECK-SAME: %[[BRANCH_INDEX:.*]]: tensor, %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> (tensor, tensor)
 func @case(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) {
-  %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor]} : (tensor, tensor, tensor) -> (tensor, tensor)
+  %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor], is_stateless = true} : (tensor, tensor, tensor) -> (tensor, tensor)
   // CHECK: %[[TUPLE_INPUT:.*]] = "mhlo.tuple"(%[[ARG0]], %[[ARG1]]) : (tensor, tensor) -> tuple, tensor>
   // CHECK: %[[CASE:.*]]:2 = "mhlo.case"(%[[BRANCH_INDEX]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]]) ( {
   // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>):
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir
new file mode 100644
index 00000000000..9f72820d15b
--- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-include-tf2xla-fallback.mlir
@@ -0,0 +1,50 @@
+// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=false" -verify-diagnostics %s | FileCheck --check-prefix NO_FALLBACK %s
+// RUN: tf-opt "-xla-legalize-tf=use-tf2xla-fallback=true device-type=XLA_CPU_JIT" -verify-diagnostics %s | FileCheck --check-prefix SUPPORTED_FALLBACK_DEVICE %s
+// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true" %s | FileCheck --check-prefix UNSPECIFIED_FALLBACK_DEVICE %s
+// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true device-type=INVALID_DEVICE_TYPE" %s | FileCheck --check-prefix UNSUPPORTED_FALLBACK_DEVICE %s
+
+// We run this test four times:
+// 1) Legalize without using TF2XLA fallback (ops cannot be legalized).
+// 2) Use fallback with a device that supports all ops (ops can be legalized).
+// 3) Use fallback with unspecified device (ops cannot be legalized).
+// 4) Use fallback with specified but unsupported device (ops cannot be legalized).
+//
+// Note: For 3) and 4) we do not use `-verify-diagnostics` because these cases
+// produce remarks that don't occur for 1) and 2) and there is no way to check
+// the remarks only for 3) and 4) (except using two files).
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + +// CHECK-LABEL: non_max_suppression_v4 +func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor) -> tensor<2xi32> { + %max_size = mhlo.constant dense<2> : tensor + // NO_FALLBACK: tf.NonMaxSuppressionV4 + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.NonMaxSuppressionV4 + // UNSPECIFIED_FALLBACK_DEVICE: tf.NonMaxSuppressionV4 + // UNSUPPORTED_FALLBACK_DEVICE: tf.NonMaxSuppressionV4 + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %max_size, %arg2, %arg3) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + return %0#0 : tensor<2xi32> +} + +// CHECK-LABEL: mirror_pad +func @mirror_pad(%arg0: tensor<2x3xcomplex>) -> tensor<4x7xcomplex> { + %0 = mhlo.constant dense<[[1, 1], [2, 2]]> : tensor<2x2xi32> + // NO_FALLBACK: tf.MirrorPad + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.MirrorPad + // UNSPECIFIED_FALLBACK_DEVICE: tf.MirrorPad + // UNSUPPORTED_FALLBACK_DEVICE: tf.MirrorPad + %1 = "tf.MirrorPad"(%arg0, %0) {mode = "SYMMETRIC"} : (tensor<2x3xcomplex>, tensor<2x2xi32>) -> tensor<4x7xcomplex> + return %1 : tensor<4x7xcomplex> +} + +// CHECK-LABEL: atan2 +func @atan2(%arg0: tensor<4x1xf32>, %arg1: tensor<4x1x4xf32>) -> tensor<4x4x4xf32> { + // NO_FALLBACK: tf.Atan2 + // SUPPORTED_FALLBACK_DEVICE-NOT: tf.Atan2 + // UNSPECIFIED_FALLBACK_DEVICE: tf.Atan2 + // UNSUPPORTED_FALLBACK_DEVICE: tf.Atan2 + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<4x1xf32>, tensor<4x1x4xf32>) -> tensor<4x4x4xf32> + return %0: tensor<4x4x4xf32> +} + +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index cd351447303..8c8d99940de 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -220,13 +220,6 @@ func @sparse_to_dense(%arg0: tensor<3x2xi32>, %arg1: tensor<3xf32>, %arg2: tenso return %0 : tensor<3x3xf32> } -// CHECK-LABEL: fft -func @fft(%arg0: tensor<3x5x8xcomplex>) -> tensor<3x5x8xcomplex> { - // CHECK: "mhlo.fft"(%arg0) - %0 = "tf.FFT"(%arg0) : (tensor<3x5x8xcomplex>) -> tensor<3x5x8xcomplex> - return %0 : tensor<3x5x8xcomplex> -} - // CHECK-LABEL: reverse_sequence func @reverse_sequence(%arg0: tensor<4x2x3x1x1xi32>, %arg1: tensor<3xi32>) -> tensor<4x2x3x1x1xi32> { // CHECK-NOT: tf.ReverseSequence @@ -265,6 +258,47 @@ func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2 return %0#0 : tensor<2xi32> } +// CHECK-LABEL: bessel_i0e +func @bessel_i0e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI0e + %0 = "tf.BesselI0e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI0e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI0e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + return %0, %1, %2 : tensor<3xf16>, tensor<3xf32>, tensor<3xf64> +} + +// CHECK-LABEL: bessel_i1e +func @bessel_i1e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI1e + %0 = "tf.BesselI1e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI1e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI1e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + return %0, %1, %2 : tensor<3xf16>, 
tensor<3xf32>, tensor<3xf64> +} + +// CHECK-LABEL: diag +func @diag(%arg0: tensor<2xf32>) -> tensor<2x2xf32> { + // CHECK-NOT: tf.Diag + %0 = "tf.Diag"(%arg0) : (tensor<2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// CHECK-LABEL: random_uniform_int +func @random_uniform_int(%arg0: tensor, %arg1: tensor) -> tensor<1000xi32> { + %0 = "tf.Const"() {value = dense<1000> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NOT: tf.RandomUniformInt + %1 = "tf.RandomUniformInt"(%0, %arg0, %arg1) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<1xi32>, tensor, tensor) -> tensor<1000xi32> + return %1 : tensor<1000xi32> +} + +// CHECK-LABEL: multinomial +func @multinomial(%arg0: tensor<2x4xf32>, %seed: tensor, %seed2: tensor) -> tensor<2x10xi32> { + // CHECK-NOT: tf.Multinomial + %samples = "tf.Const"() { value = dense<10> : tensor } : () -> tensor + %1 = "tf.Multinomial"(%arg0, %samples) {seed = 0, seed2 = 0}: (tensor<2x4xf32>, tensor) -> tensor<2x10xi32> + return %1 : tensor<2x10xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 3b4efc388eb..4c5ce2f74d9 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1,5 +1,5 @@ // RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FILECHECK_OPTS="" FileCheck %s -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s | FileCheck %s --check-prefix CHLO +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -verify-diagnostics %s | FileCheck %s --check-prefix CHLO --dump-input-filter=all // This test runs twice: // 1. Through FILECHECK_OPTS="" FileCheck with chlo legalization disabled since verifying // that the chlo ops emit produces more useful tests. @@ -439,6 +439,17 @@ func @fusedBatchNormGradV3_Training_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // Bias op legalizations. //===----------------------------------------------------------------------===// +// CHECK-LABEL: func @biasAdd_default +func @biasAdd_default(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = shape.to_extent_tensor %[[ARG0_SHAPE]] + // CHECK: %[[ARG1_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = mhlo.add %arg0, %[[ARG1_BCAST]] + %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + return %0 : tensor<1x32x10x32xi32> +} + // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 @@ -1269,6 +1280,15 @@ func @maxpool_3d_same_padding(%arg0: tensor<2x8x13x25x7xf32>) -> tensor<2x8x4x7x return %0 : tensor<2x8x4x7x7xf32> } +// CHECK-LABEL: maxpool_explicit_padding +func @maxpool_explicit_padding(%arg0: tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> { + // CHECK: tf.MaxPool + // TODO(b/165938852): need to support explicit padding in max_pool. 
+ + %0 = "tf.MaxPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "EXPLICIT", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> + return %0 : tensor<2x3x5x7xi32> +} + //===----------------------------------------------------------------------===// // MaxPoolGrad op legalizations. //===----------------------------------------------------------------------===// @@ -1499,6 +1519,35 @@ func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) -> (te return %arg1, %arg0 : tensor, tensor } +//===----------------------------------------------------------------------===// +// Elu op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @elu +func @elu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %arg0, %[[ZERO]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[EXP:.*]] = "mhlo.exponential_minus_one"(%arg0) + // CHECK: %[[RESULT:.*]] = "mhlo.select"(%[[PRED]], %arg0, %[[EXP]]) + // CHECK: return %[[RESULT]] + %0 = "tf.Elu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + return %0: tensor<1xf32> +} + +// CHECK-LABEL: func @elu_grad +// CHECK-SAME: (%[[GRADIENTS:.*]]: tensor<4x8xf32>, %[[FEATURES:.*]]: tensor) +func @elu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %[[FEATURES]], %[[ZERO]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[ADD1:.*]] = chlo.broadcast_add %[[FEATURES]], %[[ONE]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: %[[MULGRAD:.*]] = "mhlo.multiply"(%[[GRADIENTS]], %[[ADD1]]) + // CHECK: %[[RESULT:.*]] = "mhlo.select"(%[[PRED]], %[[GRADIENTS]], %[[MULGRAD]]) + // CHECK: return %[[RESULT]] + %2 = "tf.EluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> + return %2 : tensor<4x8xf32> +} + //===----------------------------------------------------------------------===// // Relu op legalizations. //===----------------------------------------------------------------------===// @@ -1726,6 +1775,20 @@ func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // Fast Fourier Transform op legalization. 
//===----------------------------------------------------------------------===// +// CHECK-LABEL: func @fft_1D +func @fft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { + // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = "FFT"} : (tensor<8xcomplex> + %0 = "tf.FFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: func @ifft_1D +func @ifft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { + // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = "IFFT"} : (tensor<8xcomplex> + %0 = "tf.IFFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + // CHECK-LABEL: func @rfft_1D func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<8xcomplex> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) @@ -1734,6 +1797,48 @@ func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<8xcomplex> { return %0 : tensor<8xcomplex> } +// CHECK-LABEL: func @rfft_1D_padded +func @rfft_1D_padded(%arg0: tensor<7xf32>) -> tensor<8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[PADDED:.*]] = "mhlo.pad"(%arg0, %2) {edge_padding_high = dense<1> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<7xf32>, tensor) -> tensor<8xf32> + // CHECK: "mhlo.fft"(%[[PADDED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = "RFFT"} : (tensor<8xf32> + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<7xf32>, tensor<1xi32>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: func @rfft_1D_sliced +func @rfft_1D_sliced(%arg0: tensor<2x9xf32>) -> tensor<2x8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[2, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<2x9xf32>) -> tensor<2x8xf32> + // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = "RFFT"} : (tensor<2x8xf32> + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<2x9xf32>, tensor<1xi32>) -> tensor<2x8xcomplex> + return %0 : tensor<2x8xcomplex> +} + +// CHECK-LABEL: func @irfft_1D +func @irfft_1D(%arg0: tensor<8xcomplex>) -> tensor<5xf32> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<5> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<8xcomplex>) -> tensor<5xcomplex> + // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<5> : tensor<1xi64>, fft_type = "IRFFT"} : (tensor<5xcomplex> + %0 = "tf.IRFFT"(%arg0, %fftlength) : (tensor<8xcomplex>, tensor<1xi32>) -> tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: fft_1D_dynamic +func @fft_1D_dynamic(%arg0: tensor>) -> tensor<8xcomplex> { + // CHECK: "tf.FFT" + %0 = "tf.FFT"(%arg0) : (tensor>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: rfft_1D_dynamic +func @rfft_1D_dynamic(%arg0: tensor) -> tensor<8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: "tf.RFFT" + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor, tensor<1xi32>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + 
//===----------------------------------------------------------------------===// // Shape op legalization. //===----------------------------------------------------------------------===// @@ -1852,16 +1957,16 @@ func @abs_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @acos // CHLO-LABEL: @acos func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: "chlo.acos"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: chlo.acos %arg0 : tensor<2xf32> // CHLO: %[[VAL_1:.*]] = "mhlo.compare"({{.*}}) {comparison_direction = "NE"} -// CHLO: %[[VAL_3:.*]] = mhlo.constant dense<2.000000e+00> -// CHLO: %[[VAL_4:.*]] = mhlo.constant dense<1.000000e+00> // CHLO: %[[VAL_5:.*]] = mhlo.multiply %arg0, %arg0 +// CHLO: %[[VAL_4:.*]] = mhlo.constant dense<1.000000e+00> // CHLO: %[[VAL_6:.*]] = mhlo.subtract %[[VAL_4]], %[[VAL_5]] // CHLO: %[[VAL_7:.*]] = "mhlo.sqrt"(%[[VAL_6]]) // CHLO: %[[VAL_8:.*]] = mhlo.constant dense<1.000000e+00> // CHLO: %[[VAL_9:.*]] = mhlo.add %[[VAL_8]], %arg0 // CHLO: %[[VAL_10:.*]] = mhlo.atan2 %[[VAL_7]], %[[VAL_9]] +// CHLO: %[[VAL_3:.*]] = mhlo.constant dense<2.000000e+00> // CHLO: %[[VAL_11:.*]] = mhlo.multiply %[[VAL_3]], %[[VAL_10]] // CHLO: %[[VAL_12:.*]] = mhlo.constant dense<3.14159274> // CHLO: %[[VAL_13:.*]] = "mhlo.select"(%[[VAL_1]], %[[VAL_11]], %[[VAL_12]]) @@ -1870,6 +1975,44 @@ func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %0 : tensor<2xf32> } +// CHECK-LABEL: @acos_dynamic +// CHLO-LABEL: @acos_dynamic +func @acos_dynamic(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: chlo.acos %arg0 : tensor<*xf32> + // `tf.Acos` is lowered to `chlo.constant_like` operations which can only be + // lowered further on ranked tensors. Unranked CHLO must be transformed to + // ranked code before further lowering. 
+ // CHLO: "tf.Acos" + %0 = "tf.Acos"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: @tan +// CHECK-SAME: (%[[ARG:.*]]: tensor<2xf32>) -> tensor<2xf32> +// CHLO-LABEL: @tan +// CHLO-SAME: (%[[ARG:.*]]: tensor<2xf32>) -> tensor<2xf32> +func @tan(%arg : tensor<2xf32>) -> tensor<2xf32> { + // CHECK: chlo.tan %[[ARG]] : tensor<2xf32> + // CHLO: %[[SINE:.*]] = "mhlo.sine"(%[[ARG]]) + // CHLO %[[COSINE:.*]] = "mhlo.cosine"(%[[ARG]]) + // CHLO %[[RESULT:.*]] = "mhlo.divide"(%[[SINE]], %[[COSINE]]) + %result = "tf.Tan"(%arg) : (tensor<2xf32>) -> tensor<2xf32> + return %result : tensor<2xf32> +} + +// CHECK-LABEL: @tan_unranked +// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>) -> tensor<*xf32> +// CHLO-LABEL: @tan_unranked +// CHLO-SAME: (%[[ARG:.*]]: tensor<*xf32>) -> tensor<*xf32> +func @tan_unranked(%arg : tensor<*xf32>) -> tensor<*xf32> { + // CHECK: chlo.tan %[[ARG]] : tensor<*xf32> + // CHLO: %[[SINE:.*]] = "mhlo.sine"(%[[ARG]]) + // CHLO %[[COSINE:.*]] = "mhlo.cosine"(%[[ARG]]) + // CHLO %[[RESULT:.*]] = "mhlo.divide"(%[[SINE]], %[[COSINE]]) + %result = "tf.Tan"(%arg) : (tensor<*xf32>) -> tensor<*xf32> + return %result : tensor<*xf32> +} + // CHECK-LABEL: func @cast_dynamic_i2f func @cast_dynamic_i2f(%arg0: tensor) -> tensor { // CHECK: "mhlo.convert"(%arg0) : (tensor) -> tensor @@ -2266,10 +2409,10 @@ func @reshape(%arg0: tensor<2xf32>, %arg1: tensor<2xi32>) -> tensor<2x1xf32> { } // CHECK-LABEL: reshape_dynamic -func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor<1x1xf32> { - // CHECK: "mhlo.reshape" - %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor<1x1xf32> - return %0 : tensor<1x1xf32> +func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor { + // CHECK: "mhlo.dynamic_reshape" + %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor + return %0 : tensor } // CHECK-LABEL: reshape_unranked @@ -2300,6 +2443,25 @@ func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { return %0 : tensor<1x2xf32> } +// CHECK-LABEL: expand_dims_dynamic +func @expand_dims_dynamic(%arg0: tensor) -> tensor { + %axis = "tf.Const"() {value = dense<1> : tensor} : () -> (tensor) + + // CHECK-DAG: [[SHAPEOF:%.+]] = shape.shape_of %arg0 + // CHECK-DAG: [[CST0:%.+]] = constant 0 + // CHECK-DAG: [[CST1:%.+]] = constant 1 + // CHECK-DAG: [[GETEXTENT0:%.+]] = shape.get_extent [[SHAPEOF]], [[CST0]] + // CHECK-DAG: [[CST1_0:%.+]] = constant 1 + // CHECK-DAG: [[GETEXTENT1:%.+]] = shape.get_extent [[SHAPEOF]], [[CST1_0]] + // CHECK-DAG: [[FROMEXTENTS:%.+]] = shape.from_extents [[GETEXTENT0]], [[CST1]], [[GETEXTENT1]] + // CHECK-DAG: [[TOEXTENTS:%.+]] = shape.to_extent_tensor [[FROMEXTENTS]] + // CHECK-DAG: [[RESHAPE:%.+]] = "mhlo.dynamic_reshape"(%arg0, [[TOEXTENTS]]) + %0 = "tf.ExpandDims"(%arg0, %axis) : (tensor, tensor) -> tensor + + // CHECK: return [[RESHAPE]] + return %0 : tensor +} + // CHECK-LABEL: func @sign // CHECK-SAME: [[ARG:%arg.*]]: tensor<1x2x3x4xf32> func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { @@ -3463,6 +3625,20 @@ func @conv3d_backprop_filter(%input: tensor<2x8x8x8x1xf32>, %out_backprop: tenso return %result : tensor<2x8x8x8x1xf32> } +// CHECK-LABEL: @collective_permute +func @collective_permute(%arg0: tensor<128x32xf32>) -> tensor<128x32xf32> { + %source_target_pairs = "tf.Const" () { + value = dense<[[0, 1], [1, 2], [2, 3]]> : tensor<3x2xi32> + } : () -> tensor<3x2xi32> + + // CHECK: "mhlo.collective_permute" + // CHECK-SAME: source_target_pairs = dense<{{\[}}[0, 
1], [1, 2], [2, 3]]> : tensor<3x2xi64> + %0 = "tf.CollectivePermute"(%arg0, %source_target_pairs) { + } : (tensor<128x32xf32>, tensor<3x2xi32>) -> tensor<128x32xf32> + + return %0 : tensor<128x32xf32> +} + // CHECK-LABEL: @cross_replica_sum func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { %replica_groups = "tf.Const" () { @@ -3483,8 +3659,9 @@ func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { func @size_scalar_i32(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor + // CHECK: %[[CAST:.*]] = tensor_cast %[[CONST]] : tensor to tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor) -> tensor - // CHECK: return %[[CONST]] + // CHECK: return %[[CAST]] return %size : tensor } @@ -3492,8 +3669,9 @@ func @size_scalar_i32(%input: tensor) -> (tensor) { func @size_scalar_i64(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor + // CHECK: %[[CAST:.*]] = tensor_cast %[[CONST]] : tensor to tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT64"} : (tensor) -> tensor - // CHECK: return %[[CONST]] + // CHECK: return %[[CAST]] return %size : tensor } @@ -3754,7 +3932,7 @@ func @unsorted_segment_prod(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %segment_ids : tensor) -> (tensor<4x?xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor - // CHECK: mhlo.constant dense<0x7F800000> : tensor + // CHECK: mhlo.constant dense<3.40282347E+38> : tensor // CHECK: mhlo.scatter // CHECK: mhlo.minimum %0 = "tf.UnsortedSegmentMin"(%data, %segment_ids, %num_segments) : (tensor<8x?x64xf32>, tensor, tensor) -> (tensor<4x?xf32>) @@ -3764,7 +3942,7 @@ func @unsorted_segment_min(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %segment_ids : tensor) -> (tensor<4x?xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor - // CHECK: mhlo.constant dense<0xFF800000> : tensor + // CHECK: mhlo.constant dense<-3.40282347E+38> : tensor // CHECK: mhlo.scatter // CHECK: mhlo.maximum %0 = "tf.UnsortedSegmentMax"(%data, %segment_ids, %num_segments) : (tensor<8x?x64xf32>, tensor, tensor) -> (tensor<4x?xf32>) @@ -4581,21 +4759,65 @@ func @cumsum_static(%arg0: tensor<4xf32>) -> tensor<4xf32> { } // CHECK-LABEL: func @cumsum_exclusive +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> func @cumsum_exclusive(%arg0: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: "tf.Cumsum" + // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[X]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[PAD]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[CONVERT_REDUCE]] %0 = 
"tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> return %1 : tensor<4xf32> } // CHECK-LABEL: func @cumsum_reverse +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> func @cumsum_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: "tf.Cumsum" + // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[REVERSE1]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[REDUCE]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[REVERSE_BACK]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumsum"(%arg0, %0) {exclusive = false, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> return %1 : tensor<4xf32> } +// CHECK-LABEL: func @cumsum_exclusive_reverse +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> +func @cumsum_exclusive_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[CONVERT_X:%.*]] = "mhlo.convert"([[REVERSE1]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor + // CHECK: "mhlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "mhlo.convert"([[PAD]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[REVERSE_BACK]] + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + // CHECK-LABEL: func @cumsum_dynamic func @cumsum_dynamic(%arg0: tensor, 
%arg1: tensor) -> tensor { // CHECK: "tf.Cumsum" @@ -4603,6 +4825,24 @@ func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor return %0 : tensor } +//===----------------------------------------------------------------------===// +// Cumprod op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @cumprod +func @cumprod(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: [[INIT:%.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK: "mhlo.reduce_window"({{.*}}, [[INIT]]) ( { + // CHECK: mhlo.mul + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumprod"(%arg0, %0) {exclusive = false, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + +//===----------------------------------------------------------------------===// +// Qr op legalization +//===----------------------------------------------------------------------===// + // CHECK: func @qr([[VAL_0:%.*]]: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) func @qr(%arg0: tensor<500x100x75xf32>) -> (tensor<500x100x75xf32>, tensor<500x75x75xf32>) { // The tf.Qr lowering is a full algorithm that is not effective to verify with @@ -4697,3 +4937,37 @@ func @softplus_f64(%arg0: tensor<8x16xf64>) -> tensor<8x16xf64> { // CHECK: return [[ENTRY_SELECT]] : tensor<8x16xf64> return %0 : tensor<8x16xf64> } + +// CHECK-LABEL: @xla_gather +func @xla_gather(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10x1x300xf32> { + %cst = "tf.Const"() { value = dense<[1, 1, 300]> : tensor<3xi64> } : () -> tensor<3xi64> + + // CHECK: "mhlo.gather" + // CHECK-SAME: dimension_numbers = + // CHECK-SAME: collapsed_slice_dims = dense<0> : tensor<1xi64> + // CHECK-SAME: index_vector_dim = 1 : i64 + // CHECK-SAME: offset_dims = dense<1> : tensor<1xi64> + // CHECK-SAME: start_index_map = dense<0> : tensor<1xi64> + // CHECK-SAME: indices_are_sorted = true + // CHECK-SAME: slice_sizes = dense<[1, 1, 300]> : tensor<3xi64> + + %0 = "tf.XlaGather"(%arg0, %arg1, %cst) {dimension_numbers = "\0A\01\01\12\01\00\1A\01\00 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<10x1x300xf32> + return %0 : tensor<10x1x300xf32> +} + +// CHECK-LABEL: @xla_gather_i32 +func @xla_gather_i32(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10x1x300xf32> { + %cst = "tf.Const"() { value = dense<[1, 1, 300]> : tensor<3xi32> } : () -> tensor<3xi32> + + // CHECK: "mhlo.gather" + // CHECK-SAME: dimension_numbers = + // CHECK-SAME: collapsed_slice_dims = dense<0> : tensor<1xi64> + // CHECK-SAME: index_vector_dim = 1 : i64 + // CHECK-SAME: offset_dims = dense<1> : tensor<1xi64> + // CHECK-SAME: start_index_map = dense<0> : tensor<1xi64> + // CHECK-SAME: indices_are_sorted = true + // CHECK-SAME: slice_sizes = dense<[1, 1, 300]> : tensor<3xi64> + + %0 = "tf.XlaGather"(%arg0, %arg1, %cst) {dimension_numbers = "\0A\01\01\12\01\00\1A\01\00 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi32>) -> tensor<10x1x300xf32> + return %0 : tensor<10x1x300xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc b/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc index 1a3f0c16247..de8d6fc697b 100644 --- a/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc +++ b/tensorflow/compiler/mlir/xla/tests/mlir_hlo_builder_test.cc @@ 
-42,13 +42,13 @@ class XlaBuilderTest : public ::testing::Test { protected: XlaBuilderTest() : name_(SetupTest()), - context_(), module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(&context_))), builder_(&module_->getBodyRegion()), - xla_builder_(name_, builder_, module_->getLoc()) {} + xla_builder_(name_, builder_, module_->getLoc()) { + context_.loadDialect(); + } string SetupTest() { - mlir::registerDialect(); return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 9929bd85b43..ff1bcadda7b 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -362,7 +362,9 @@ func @main(%arg0: tensor<2x3xf32>, %arg1: tensor<5x5xf32>) -> tensor<1x2x3xf32> // CHECK: [[VAL_1:%.*]] = f32[2,3] parameter(0) // CHECK: [[VAL_2:%.*]] = f32[5,5] parameter(1) // CHECK: ROOT -// CHECK-SAME: f32[1,2,3] custom-call(f32[2,3] [[VAL_1]], f32[5,5] [[VAL_2]]), custom_call_target="foo", backend_config="bar" +// CHECK-SAME: f32[1,2,3] custom-call(f32[2,3] [[VAL_1]], f32[5,5] [[VAL_2]]) +// CHECK-SAME: custom_call_target="foo" +// CHECK-SAME: backend_config="bar" // ----- @@ -1087,3 +1089,15 @@ func @main(%arg: tensor<3x4xf32>, %token: !mhlo.token) -> !mhlo.token { } // CHECK-NOT: frontend_attributes + +// ----- + +// Checks exporting rng-bit-generator. + +// CHECK: HloModule +func @main(%arg: tensor<3xui64>) -> tuple, tensor<2x2xui32>> { +// CHECK: %[[ARG0:.*]] = u64[3] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %[[ARG0]]), algorithm=rng_philox + %0 = "mhlo.rng_bit_generator"(%arg) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> + return %0 : tuple, tensor<2x2xui32>> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir index 97c53cb5f9f..0c2aee5a2fd 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export_errors.mlir @@ -2,6 +2,6 @@ // CHECK: Opaque elements attr not supported func @main() { - %0 = "tf.Const"() {value = opaque<"tf", "0x0123456789ABCDEF"> : tensor<4xf32>} : () -> tensor<4xf32> + %0 = "mhlo.constant"() {value = opaque<"mhlo", "0x0123456789ABCDEF"> : tensor<4xf32>} : () -> tensor<4xf32> return } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index d89b1fa44e1..4d4e0213da8 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1005,3 +1005,12 @@ add { // CHECK: "mhlo.not"(%[[ARG0]]) {name = "{{.*}}"} : (tensor<4xui16>) -> tensor<4xui16> ROOT %not.2 = u16[4] not(u16[4] %Arg_0.1) } + +// CHECK-LABEL: func @rngbitgen +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3xui64>) +%rngbitgen (Arg_0.1: u64[3]) -> (u64[3], u32[2,2]) { + %Arg_0.1 = u64[3] parameter(0) + // CHECK: "mhlo.rng_bit_generator"(%[[ARG0]]) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> + ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox +} + diff --git a/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir b/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir new file mode 100644 index 
00000000000..2182ce6106d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir @@ -0,0 +1,43 @@ +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s --dump-input=always + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc(unknown) + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-NOT: metadata + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("AfterAll") + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-SAME: metadata={op_name="AfterAll"} + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("name@function") + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-SAME: metadata={op_name="name"} + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("file_name":2:8) + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-SAME: metadata={source_file="file_name" source_line=2} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 0b420fff785..c990473a6d4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -15,6 +15,7 @@ limitations under the License. // This file implements logic for lowering TensorFlow dialect to XLA dialect. +#include #include #include #include @@ -42,6 +43,7 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" @@ -50,6 +52,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" +#include "tensorflow/compiler/mlir/xla/attribute_importer.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" #include "tensorflow/compiler/xla/client/padding.h" @@ -57,7 +60,7 @@ limitations under the License. 
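The location_to_op_metadata.mlir tests above pin down how MLIR locations are expected to surface as HLO OpMetadata: a name location becomes op_name (anything after '@' is dropped), a file:line:col location becomes source_file/source_line (the column is dropped), and an unknown location produces no metadata. The standalone C++ sketch below is only a model of that mapping as encoded by the FileCheck lines, not the exporter's actual code; the struct and function names are mine.

#include <string>

// Schematic stand-ins for the two location kinds exercised by the tests.
struct NameLoc { std::string name; };                  // e.g. "AfterAll" or "name@function"
struct FileLineColLoc { std::string file; int line; }; // e.g. "file_name":2:8

struct OpMetadata {
  std::string op_name;      // empty if not set
  std::string source_file;  // empty if not set
  int source_line = 0;
};

// loc("name@function") keeps only the part before '@'; loc("AfterAll") is
// used verbatim, matching metadata={op_name="..."} in the CHECK lines.
OpMetadata FromNameLoc(const NameLoc& loc) {
  OpMetadata m;
  m.op_name = loc.name.substr(0, loc.name.find('@'));
  return m;
}

// loc("file":line:col) maps to source_file/source_line; the column has no
// counterpart in OpMetadata and is dropped.
OpMetadata FromFileLineColLoc(const FileLineColLoc& loc) {
  OpMetadata m;
  m.source_file = loc.file;
  m.source_line = loc.line;
  return m;
}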
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/kernels/conv_grad_shape_utils.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" @@ -68,12 +71,22 @@ namespace { constexpr char kShardingAttr[] = "mhlo.sharding"; class LegalizeTF : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: LegalizeTF() = default; LegalizeTF(const LegalizeTF &) {} - explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo) { + explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { allow_partial_conversion_ = allow_partial_conversion; legalize_chlo_ = legalize_chlo; + use_tf2xla_fallback_ = tf2xla_fallback_device_type.hasValue(); + if (tf2xla_fallback_device_type.hasValue()) { + device_type_ = tf2xla_fallback_device_type.getValue().str(); + } } /// Performs the lowering to XLA dialect. @@ -89,15 +102,26 @@ class LegalizeTF : public PassWrapper { llvm::cl::desc( "Also legalizes intermediate chlo ops to hlo (default true)"), llvm::cl::init(true)}; + Option use_tf2xla_fallback_{ + *this, "use-tf2xla-fallback", + llvm::cl::desc( + "Also use TF2XLA fallback for legalization (default false)"), + llvm::cl::init(false)}; + Option device_type_{ + *this, "device-type", + llvm::cl::desc( + "The device type used by TF2XLA fallback. Must be specified if " + "use-tf2xla-fallback is true, otherwise not used."), + llvm::cl::init("INVALID_DEVICE_TYPE")}; }; /// Returns if the given TF data format string is the default format. static bool IsDefaultDataFormat(StringRef format) { return format == "NHWC"; } /// Returns the feature dimension for the given format and input type. -static size_t GetFeatureDimension(StringAttr format, +static size_t GetFeatureDimension(StringRef format, RankedTensorType inputType) { - return IsDefaultDataFormat(format.getValue()) ? inputType.getRank() - 1 : 1; + return IsDefaultDataFormat(format) ? inputType.getRank() - 1 : 1; } // Gets all integer values from the given attribute and push them to `values`. @@ -246,49 +270,21 @@ tensorflow::TensorShape ToTensorShape( sizes.begin(), sizes.end())); } -// Returns minimal value for the given int or float element type. -static ConstOp GetMinValueForType(Type ty, Location loc, - PatternRewriter *rewriter) { - RankedTensorType scalar_ty = RankedTensorType::get({}, ty); - - DenseElementsAttr attr; - if (auto float_ty = ty.dyn_cast_or_null()) { - APFloat neg_inf = - APFloat::getInf(float_ty.getFloatSemantics(), /*negative=*/true); - attr = DenseElementsAttr::get(scalar_ty, neg_inf); - } else { - auto int_ty = ty.cast(); - APInt min_val = APInt::getSignedMinValue(int_ty.getWidth()); - attr = DenseElementsAttr::get(scalar_ty, min_val); - } - return rewriter->create(loc, attr); -} - -// Returns maximal value for the given int or float element type. 
-static ConstOp GetMaxValueForType(Type ty, Location loc, - PatternRewriter *rewriter) { - RankedTensorType scalar_ty = RankedTensorType::get({}, ty); - - DenseElementsAttr attr; - if (auto float_ty = ty.dyn_cast_or_null()) { - APFloat pos_inf = - APFloat::getInf(float_ty.getFloatSemantics(), /*negative=*/false); - attr = DenseElementsAttr::get(scalar_ty, pos_inf); - } else { - auto int_ty = ty.cast(); - APInt max_val = APInt::getSignedMaxValue(int_ty.getWidth()); - attr = DenseElementsAttr::get(scalar_ty, max_val); - } - return rewriter->create(loc, attr); -} - -// Returns int or float scalar DenseElementsAttr attribute with the given -// element type and the value. +// Returns int, float, or complex scalar DenseElementsAttr attribute with the +// given element type and the value. static ConstOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, OpBuilder *builder) { return builder->create(loc, hlo::GetScalarOfType(ty, raw_value)); } +// Returns a limit scalar const op for the given type. +// Requires FloatType or IntegerType +static ConstOp GetScalarLimitConstOfType(Type ty, Location loc, + hlo::ScalarLimit limit, + OpBuilder *builder) { + return builder->create(loc, hlo::GetScalarLimitOfType(ty, limit)); +} + // Creates an mhlo::SliceOp where the major dimensions have full size, and // the minor dimensions have the provided offsets and sizes. static Value SliceInMinorDims(Location loc, Value v, @@ -735,12 +731,33 @@ static void CreateWhile32(Location loc, int num_iterations, // BatchNorm op utilities. //===----------------------------------------------------------------------===// -static IntegerAttr getFeatureDimensionAttr(Builder &b, StringAttr format, +static IntegerAttr getFeatureDimensionAttr(Builder &b, StringRef format, Value input) { return b.getI64IntegerAttr( GetFeatureDimension(format, input.getType().cast())); } +//===----------------------------------------------------------------------===// +// FFT op utilities. +//===----------------------------------------------------------------------===// +// Returns the 1D i64 elements attribute populated with the inner-most dim of +// the value. +static DenseIntElementsAttr GetInnerDimFromValue(ShapedType type, + Builder *builder) { + if (type.getRank() == 0) { + return builder->getI64TensorAttr({}); + } + return builder->getI64TensorAttr(type.getShape().back()); +} + +// Returns True if the inner-most dim is static. +bool CheckInnerDimStatic(ShapedType type, Builder *builder) { + if (!type.hasRank()) { + return false; + } + return !type.isDynamicDim(type.getShape().size() - 1); +} + //===----------------------------------------------------------------------===// // MatMul op utilities. //===----------------------------------------------------------------------===// @@ -1049,6 +1066,21 @@ static void BuildSortComparisonBody(llvm::ArrayRef element_types, builder->create(loc, compare); } +//===----------------------------------------------------------------------===// +// XlaGather op utilities. 
+//===----------------------------------------------------------------------===// + +bool HasValidGatherDims(StringAttr attr) { + ::xla::GatherDimensionNumbers dims; + return dims.ParseFromString(attr.getValue().str()); +} + +GatherDimensionNumbers GetGatherDimNumsAttr(StringAttr attr, Builder *builder) { + ::xla::GatherDimensionNumbers dims; + if (!dims.ParseFromString(attr.getValue().str())) return {}; + return ::xla::ConvertGatherDimensionNumbers(dims, builder); +} + //===----------------------------------------------------------------------===// // Op converters. //===----------------------------------------------------------------------===// @@ -1096,7 +1128,7 @@ class ConvertBiasAddOp : public OpRewritePattern { PatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto feature_dim = GetFeatureDimension( - op.data_formatAttr(), op.value().getType().cast()); + op.data_format(), op.value().getType().cast()); auto bias_broadcast = Broadcast1DToFeatureDim(loc, op.value(), op.bias(), feature_dim, rewriter); rewriter.replaceOpWithNewOp(op, op.value(), bias_broadcast); @@ -1675,6 +1707,80 @@ class ConvertEinsumOp : public OpRewritePattern { } }; +template +class ConvertFFTOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + auto input_ty = op.input().getType().template cast(); + if (!input_ty.hasRank()) { + return failure(); + } + auto input_shape = input_ty.getShape(); + DenseIntElementsAttr fft_length_attr; + if (!matchPattern(op.fft_length(), m_Constant(&fft_length_attr))) { + return failure(); + } + int64_t fft_length; + if (fft_length_attr.getNumElements() != 0) { + fft_length = fft_length_attr.getValue(0).getInt(); + } else { + return failure(); + } + + std::string fft_string = "RFFT"; + if (typeid(OpTy) == typeid(TF::IRFFTOp)) { + fft_length = fft_length / 2 + 1; + fft_string = "IRFFT"; + } + auto loc = op.getLoc(); + + // The inner-most dim cannot be dynamic. 
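The xla_gather tests earlier in this patch pass tf.XlaGather a dimension_numbers string attribute that is a serialized xla::GatherDimensionNumbers proto; HasValidGatherDims and GetGatherDimNumsAttr above round-trip it with ParseFromString before converting it to an MLIR attribute. A hedged sketch of how such an attribute value can be produced, assuming the usual xla_data.proto field names (offset_dims, collapsed_slice_dims, start_index_map, index_vector_dim); serializing these values should reproduce the byte string used in the tests, but the helper name is mine.

#include <string>

#include "tensorflow/compiler/xla/xla_data.pb.h"  // xla::GatherDimensionNumbers

// Builds dimension numbers matching what the xla_gather FileCheck lines
// expect after conversion: offset_dims=[1], collapsed_slice_dims=[0],
// start_index_map=[0], index_vector_dim=1.
std::string MakeGatherDimensionNumbersAttr() {
  xla::GatherDimensionNumbers dims;
  dims.add_offset_dims(1);
  dims.add_collapsed_slice_dims(0);
  dims.add_start_index_map(0);
  dims.set_index_vector_dim(1);
  // The serialized form is what ends up in the op's string attribute; the
  // pass parses it back out before building the MLIR attribute.
  return dims.SerializeAsString();
}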
+ if (input_ty.isDynamicDim(input_shape.size() - 1)) { + return failure(); + } + + auto expected_shape = llvm::to_vector<4>(input_shape.drop_back()); + expected_shape.push_back(fft_length); + + // Zero pad or truncate the last axis + Value reshaped = op.input(); + SmallVector begin_indices(input_shape.size(), 0); + SmallVector strides(input_shape.size(), 1); + + // Last dim larger than fft_length, slice the input + if (input_shape.back() > fft_length) { + reshaped = rewriter.create( + op.getLoc(), + RankedTensorType::get(expected_shape, input_ty.getElementType()), + op.input(), GetI64ElementsAttr(begin_indices, &rewriter), + GetI64ElementsAttr(expected_shape, &rewriter), + GetI64ElementsAttr(strides, &rewriter)); + + // Last dim smaller than fft_length, zero-pad the input + } else if (input_ty.getShape().back() < fft_length) { + SmallVector no_padding(input_shape.size(), 0); + SmallVector padding(input_shape.size() - 1, 0); + padding.push_back(fft_length - input_shape.back()); + Value zero = + GetScalarConstOfType(input_ty.getElementType(), loc, 0, &rewriter); + reshaped = rewriter.create( + loc, RankedTensorType::get(expected_shape, input_ty.getElementType()), + op.input(), zero, GetI64ElementsAttr(no_padding, &rewriter), + GetI64ElementsAttr(padding, &rewriter), + GetI64ElementsAttr(no_padding, &rewriter)); + } + + rewriter.replaceOpWithNewOp(op, op.getType(), reshaped, fft_string, + rewriter.getI64TensorAttr(fft_length)); + return success(); + } +}; + +using ConvertRFFTOp = ConvertFFTOp; +using ConvertIRFFTOp = ConvertFFTOp; + // The base class to convert TensorFlow FusedBatchNormGrad*Op to HLO // BatchNormGradOp for training and a sequence of binary ops for inference. // TODO(b/145536565): move to legalize_tf_patterns.td if it applies. @@ -1708,7 +1814,7 @@ class ConvertFusedBatchNormGradBase act = rewriter.create(loc, act, kernel_type); auto feature_dim_attr = - getFeatureDimensionAttr(rewriter, op.data_formatAttr(), act); + getFeatureDimensionAttr(rewriter, op.data_format(), act); auto feature_dim = feature_dim_attr.getValue().getSExtValue(); // Gets the result values. @@ -1723,7 +1829,7 @@ class ConvertFusedBatchNormGradBase auto training_op = rewriter.create( loc, result_type, act, scale, mean, var, grad, op.epsilon(), - feature_dim_attr.getValue()); + feature_dim); x_backprop = rewriter.create(loc, training_op.getResult(), 0); @@ -1802,7 +1908,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { LogicalResult matchAndRewrite(FusedBatchNormOpT op, PatternRewriter &rewriter) const override { auto feature_dim = - getFeatureDimensionAttr(rewriter, op.data_formatAttr(), op.x()); + getFeatureDimensionAttr(rewriter, op.data_format(), op.x()); auto input_type_tensor = op.x().getType().template cast(); auto input_element_type = input_type_tensor.getElementType(); @@ -1843,7 +1949,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { auto bn_train_op = rewriter.create( op.getLoc(), result_type, bn_train_input, op.scale(), op.offset(), - op.epsilon(), feature_dim.getValue()); + op.epsilon(), feature_dim.getInt()); // HLO op outputs a tuple of tensors. Extract those results. 
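ConvertFFTOp above normalizes the innermost axis before emitting the HLO FFT: for IRFFT the effective length is fft_length/2 + 1, an oversized axis is sliced down, and an undersized one is zero-padded. A tiny standalone C++ restatement of just that size arithmetic (helper and struct names are mine), handy for sanity-checking shapes.

#include <cstdint>
#include <iostream>

struct InnerDimPlan {
  int64_t target_size;   // size of the innermost axis fed to the FFT op
  int64_t slice_amount;  // elements dropped from the end (0 if none)
  int64_t pad_amount;    // zeros appended at the end (0 if none)
};

InnerDimPlan PlanInnerDim(int64_t input_inner_size, int64_t fft_length,
                          bool is_irfft) {
  // IRFFT consumes fft_length/2 + 1 inputs along the transformed axis, so
  // the axis is normalized to that size instead of fft_length itself.
  int64_t target = is_irfft ? fft_length / 2 + 1 : fft_length;
  InnerDimPlan plan{target, 0, 0};
  if (input_inner_size > target) plan.slice_amount = input_inner_size - target;
  if (input_inner_size < target) plan.pad_amount = target - input_inner_size;
  return plan;
}

int main() {
  // e.g. RFFT of a length-5 axis with fft_length=8 needs 3 trailing zeros.
  InnerDimPlan p = PlanInnerDim(/*input_inner_size=*/5, /*fft_length=*/8,
                                /*is_irfft=*/false);
  std::cout << p.target_size << " " << p.pad_amount << "\n";  // prints "8 3"
  return 0;
}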
auto bn_train_op_result = bn_train_op.getResult(); Value y_out = rewriter.create( @@ -1930,7 +2036,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { op.getLoc(), /*result_type=*/bn_train_input_type_tensor, bn_train_input, op.scale(), op.offset(), op.mean(), op.variance(), op.epsilon(), - feature_dim.getValue()); + feature_dim.getInt()); // Convert back to input type to stay aligned with expected output type // for TF op. @@ -2368,16 +2474,23 @@ class ConvertMaxPoolOp : public OpRewritePattern { Type element_type = op.input().getType().template cast().getElementType(); if (!element_type.isSignlessIntOrFloat()) return failure(); + tensorflow::Padding padding; + if (!GetPaddingFromString(op.padding().str(), &padding).ok()) + return failure(); + if (padding == tensorflow::Padding::EXPLICIT) { + return failure(); + } Location loc = op.getLoc(); - ConstOp init = GetMinValueForType(element_type, loc, &rewriter); + ConstOp init = GetScalarLimitConstOfType(element_type, loc, + hlo::kInfinityLowest, &rewriter); auto input_ty = op.input().getType().template dyn_cast(); if (!input_ty) return failure(); DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.ksize(), op.strides(), op.padding(), &rewriter); auto reduce = rewriter.create( - loc, op.getType(), op.input(), init.getResult(), - GetI64ElementsAttr(op.ksize()), GetI64ElementsAttr(op.strides()), + loc, op.getType(), op.input(), init, GetI64ElementsAttr(op.ksize()), + GetI64ElementsAttr(op.strides()), /*base_dilations=*/DenseIntElementsAttr(), /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); BuildReduceBody(element_type, &reduce.body(), &rewriter); @@ -3078,7 +3191,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // axis. For instance, if there are 4 dims, we can support a // shrink_axis_mask of 0001 (1), 0011 (3), 0111 (7), or 1111 (15), but no // other. - bool shrink_axis_mask_ok = op.shrink_axis_mask().isMask(); + bool shrink_axis_mask_ok = llvm::isMask_64(op.shrink_axis_mask()); if (!shrink_axis_mask_ok) return rewriter.notifyMatchFailure( op, @@ -3087,27 +3200,27 @@ class ConvertStridedSliceOp : public OpRewritePattern { // When begin/end values are dynamic, the ellipsis mask, if set, must refer // to the last dimension. 
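The switch above from APInt::isMask() to llvm::isMask_64() on shrink_axis_mask keeps the same requirement: the mask must be a contiguous run of low bits, so only the leading axes can be shrunk. A minimal C++ equivalent of that predicate (the same bit trick, not LLVM's implementation; the function name is mine) together with the accepted 4-D values from the comment.

#include <cassert>
#include <cstdint>

// True for 0b1, 0b11, 0b111, ... and false for 0, matching llvm::isMask_64.
// Adding 1 carries through the run of low set bits, so ANDing with the
// original value leaves nothing behind exactly when the value is such a run.
bool IsLowBitMask(uint64_t v) { return v != 0 && (v & (v + 1)) == 0; }

int main() {
  assert(IsLowBitMask(0b0001) && IsLowBitMask(0b0011) &&
         IsLowBitMask(0b0111) && IsLowBitMask(0b1111));
  assert(!IsLowBitMask(0b0101) && !IsLowBitMask(0b0100));
  return 0;
}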
- int ellipsis_mask = op.ellipsis_mask().getZExtValue(); + int ellipsis_mask = op.ellipsis_mask(); if (!(ellipsis_mask == 0 || ellipsis_mask == (1 << last_dim))) return rewriter.notifyMatchFailure( op, "requires that ellipsis_mask, if set, refer to the last dimension of " "input (when begin/end values are dynamic)"); - APInt begin_mask = op.begin_mask(); - if (!begin_mask.isNullValue()) + uint64_t begin_mask = op.begin_mask(); + if (begin_mask) return rewriter.notifyMatchFailure( op, "requires that begin_mask is either set to 0 or not set when " "begin/end values are dynamic"); - APInt end_mask = op.end_mask(); - if (!end_mask.isNullValue()) + uint64_t end_mask = op.end_mask(); + if (end_mask) return rewriter.notifyMatchFailure( op, "requires that end_mask is either set to 0 or not set when begin/end " "values are dynamic"); - APInt new_axis_mask = op.new_axis_mask(); - if (!new_axis_mask.isNullValue()) + uint64_t new_axis_mask = op.new_axis_mask(); + if (new_axis_mask) return rewriter.notifyMatchFailure( op, "requires that new_axis_mask is either set to 0 or not set when " @@ -3620,7 +3733,8 @@ class ConvertMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMinValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityLowest, rewriter); } }; @@ -3637,7 +3751,8 @@ class ConvertMinOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMaxValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityMax, rewriter); } }; @@ -3773,7 +3888,8 @@ class ConvertArgMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter &rewriter) { - return GetMinValueForType(reduce_element_type, loc, &rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityLowest, &rewriter); } static StringRef GetDirection() { return "GT"; } @@ -4360,7 +4476,7 @@ class ConvertOneHotOp : public OpRewritePattern { } int64_t depth = depth_attr.getValue({}).getSExtValue(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis == -1) axis = indices_shape.size(); llvm::SmallVector broadcast_dims(indices_shape.size()); @@ -4636,7 +4752,7 @@ class ConvertUnpackOp : public OpRewritePattern { if (!value_type) return failure(); int64_t value_rank = value_type.getRank(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < 0) axis += value_rank; // Parameters for constructing each slice. @@ -4712,7 +4828,7 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { auto output_type = RankedTensorType::get(output_shape, data_type.getElementType()); - // Broadccast the initial value for reduction. This will become the + // Broadcast the initial value for reduction. This will become the // 'operand' parameter to scatter to for the final scatter op. 
Value init = ConcreteClass::GetInitialValue(data_type.getElementType(), op.getLoc(), &rewriter); @@ -4752,7 +4868,8 @@ class ConvertUnsortedSegmentMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMinValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, hlo::kLowest, + rewriter); } }; @@ -4765,7 +4882,8 @@ class ConvertUnsortedSegmentMinOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMaxValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, hlo::kMax, + rewriter); } }; @@ -5007,7 +5125,12 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { SmallVector unpacked_indices_type( indices_type.getDimSize(0), RankedTensorType::get({}, indices_type.getElementType())); - auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(64), 0); + // Note on zero_attr integer type: DynamicUpdateSlice op start_indices are + // required to have matching types. This rewrite rule creates + // DynamicUpdateSlice ops where the first "start index" is always i32 and + // subsequent ones are constructed based on zero_attr. Thus the type + // for zero_attr needs to be i32 as well. + auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(32), 0); auto unpacked_indices = rewriter.create( op.getLoc(), unpacked_indices_type, indices, zero_attr); @@ -5071,26 +5194,25 @@ class ConvertXlaDynamicUpdateSliceOp } }; -/// Converts the Cumsum TensorFlow op to the HLO ReduceWindow op by setting -/// appropriate window dimensions, with 'add' as the reduction function. The -/// input tensor needs to have a static shape, and 'axis' must be const. The -/// TableGen pattern is not used for this rewrite because it involves regions. -class ConvertCumsumOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +// Converts the Cumsum or Cumprod TensorFlow op to the HLO ReduceWindow op by +// setting appropriate window dimensions, with the given aggregation op as the +// reduction function. The input tensor needs to have a static shape, and 'axis' +// must be const. The TableGen pattern is not used for this rewrite because it +// involves regions. +template +class ConvertCumOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::CumsumOp op, + LogicalResult matchAndRewrite(OpT op, PatternRewriter &rewriter) const override { auto input = op.x(); - auto input_type = input.getType().dyn_cast(); + auto input_type = input.getType().template dyn_cast(); if (!input_type || !input_type.hasStaticShape()) { return failure(); } - // TODO(jennik): Add support for the optional 'exclusive' and 'reverse' - // arguments. - if (op.exclusive() || op.reverse()) { - return failure(); - } + ArrayRef input_shape = input_type.getShape(); + int64_t rank = input_shape.size(); // We can only match when the axis is a constant scalar. DenseIntElementsAttr axis_attr; @@ -5098,15 +5220,6 @@ class ConvertCumsumOp : public OpRewritePattern { return failure(); } - // Convert if we need to enlarge the element type's bitwidth to avoid - // precision loss. 
- Type input_element_type = input_type.getElementType(); - Type sum_element_type = GetSumAccumulationType(input_element_type); - input = rewriter.create(op.getLoc(), input, sum_element_type); - - ArrayRef input_shape = input_type.getShape(); - int64_t rank = input_shape.size(); - // Get the dimension to apply the reduction on, and offset properly if it is // negative. int64_t axis = (*axis_attr.begin()).getSExtValue(); @@ -5114,6 +5227,25 @@ class ConvertCumsumOp : public OpRewritePattern { axis += rank; } + // If we're supposed to sum things up in the reverse direction, we reverse + // the input and then later reverse the output. + if (op.reverse()) { + llvm::SmallVector dims_to_reverse({axis}); + input = rewriter.create( + op.getLoc(), op.getType(), input, + GetI64ElementsAttr(dims_to_reverse, &rewriter)); + } + + // Convert if we need to enlarge the element type's bitwidth to avoid + // precision loss. + Type input_element_type = input_type.getElementType(); + + // TODO(hinsu): Handle complex element types. + if (!input_element_type.isIntOrFloat()) return failure(); + + Type sum_element_type = GetSumAccumulationType(input_element_type); + input = rewriter.create(op.getLoc(), input, sum_element_type); + SmallVector window_dims(rank, 1); SmallVector window_strides(rank, 1); window_dims[axis] = input_shape[axis]; @@ -5124,8 +5256,9 @@ class ConvertCumsumOp : public OpRewritePattern { RankedTensorType::get({rank, 2}, rewriter.getIntegerType(64)), paddings); - Value init = - GetScalarConstOfType(sum_element_type, op.getLoc(), 0, &rewriter); + int64_t init_value = (std::is_same::value) ? 0 : 1; + Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), init_value, + &rewriter); auto reduce = rewriter.create( op.getLoc(), input_type, input, init, @@ -5133,18 +5266,45 @@ class ConvertCumsumOp : public OpRewritePattern { GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_strides)), /*base_dilations=*/DenseIntElementsAttr(), /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); - BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); + BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); Value result = reduce.getResult(); + if (op.exclusive()) { + // In "exclusive" operation, the output will start with the "init" (0) + // values. There is no way to express that as a ReduceWindowOp, so run the + // normal operation, and then use a PadOp to add the 0 "column" on the + // left and cut away the last column on the right. + llvm::SmallVector low_padding(rank, 0); + llvm::SmallVector high_padding(rank, 0); + llvm::SmallVector interior_padding(rank, 0); + low_padding[axis] = 1; + high_padding[axis] = -1; + result = rewriter.create( + op.getLoc(), op.getType(), result, init, + GetI64ElementsAttr(low_padding, &rewriter), + GetI64ElementsAttr(high_padding, &rewriter), + GetI64ElementsAttr(interior_padding, &rewriter)); + } + // Convert back if we enlarged the element type's bitwidth. result = rewriter.create(op.getLoc(), result, input_element_type); + if (op.reverse()) { + llvm::SmallVector dims_to_reverse({axis}); + result = rewriter.create( + op.getLoc(), op.getType(), result, + GetI64ElementsAttr(dims_to_reverse, &rewriter)); + } + rewriter.replaceOp(op, result); return success(); } }; +using ConvertCumsumOp = ConvertCumOp; +using ConvertCumprodOp = ConvertCumOp; + // Converts the Tensorflow ShapeOp to a sequence of Shape dialect and Standard // dialect lowerings. 
This involves extracting the shape type, extracting and // converting each dimension to a known integer type, and repacking into a final @@ -5173,6 +5333,101 @@ class ConvertShapeOp : public OpRewritePattern { } }; +class ConvertDynamicReshapeOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::ReshapeOp op, + PatternRewriter &rewriter) const override { + auto tensor = op.tensor(); + auto shape = op.shape(); + + auto tensor_ty = tensor.getType().cast(); + auto shape_ty = shape.getType().cast(); + auto result_ty = op.getType().cast(); + + if (!result_ty.hasRank() || !tensor_ty.hasRank() || !shape_ty.hasRank()) { + return failure(); + } + + // Handle with the static case. + if (result_ty.hasStaticShape()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(op, result_ty, tensor, + shape); + return success(); + } +}; + +class ConvertDynamicExpandDimsOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::ExpandDimsOp op, + PatternRewriter &rewriter) const override { + auto input = op.input(); + auto input_ty = input.getType().cast(); + auto result_ty = op.getType().cast(); + if (!result_ty.hasRank() || !input_ty.hasRank() || + result_ty.hasStaticShape()) { + return failure(); + } + + DenseIntElementsAttr expand_dims_attr; + if (!matchPattern(op.dim(), m_Constant(&expand_dims_attr))) { + return failure(); + } + + auto shape = rewriter.create( + op.getLoc(), + RankedTensorType::get({input_ty.getRank()}, rewriter.getIndexType()), + input); + auto expand_dims = llvm::to_vector<6>(expand_dims_attr.getIntValues()); + + llvm::SmallVector dims; + dims.resize(result_ty.getRank()); + + auto inserted_dim = expand_dims_attr.getValue({}) + .cast() + .getValue() + .getSExtValue(); + + // Handle the negative value use case. + if (inserted_dim < 0) { + inserted_dim += result_ty.getRank(); + // This means the value is completely incorrect, just return. + if (inserted_dim < 0) { + return failure(); + } + } + + dims[inserted_dim] = rewriter.create(op.getLoc(), 1); + + for (int i = 0; i < dims.size() - 1; i++) { + // Add the extracted dim. + auto index = rewriter.create(op.getLoc(), i); + auto dim = rewriter.create( + op.getLoc(), rewriter.getIndexType(), shape, index); + + dims[i >= inserted_dim ? i + 1 : i] = dim; + } + + auto from_extents = rewriter.create( + op.getLoc(), shape::ShapeType::get(op.getContext()), dims); + + auto to_extent_tensor = rewriter.create( + op.getLoc(), + RankedTensorType::get({result_ty.getRank()}, rewriter.getIndexType()), + from_extents); + + rewriter.replaceOpWithNewOp(op, result_ty, input, + to_extent_tensor); + return success(); + } +}; + // Converts a TF QR op to HLO. class ConvertQrOp : public OpRewritePattern { public: @@ -5672,7 +5927,7 @@ class ConvertQrOp : public OpRewritePattern { void EmitLegalizationErrors(Operation *op, const DenseSet &nonlegalized_ops) { // Track the legalization failures by mapping op name to information about - // that failure: the number of unlegalized occurances of the op, and one + // that failure: the number of unlegalized occurrences of the op, and one // example operation that failed. std::map> op_name_to_error_info; DenseSet error_ops; @@ -5714,9 +5969,14 @@ void EmitLegalizationErrors(Operation *op, // Performs the lowering to XLA dialect. 
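The ConvertCumOp template above (shared by Cumsum and Cumprod) and the cumsum_exclusive / cumsum_reverse / cumsum_exclusive_reverse FileCheck tests earlier in this patch all encode the same scalar recurrence: reverse flips the axis before and after the scan, and exclusive shifts the result by one starting from the identity, which is what the pad with edge_padding_low=1, edge_padding_high=-1 implements. A short plain C++ reference of the intended semantics (not of the ReduceWindow lowering itself; the function name is mine), useful for eyeballing expected values.

#include <algorithm>
#include <cstddef>
#include <vector>

// Reference cumulative sum/product along a 1-D axis. `exclusive` starts the
// scan from the identity (0 for sum, 1 for prod) and drops the final total;
// `reverse` flips the axis before and after the scan.
std::vector<float> Cum(std::vector<float> x, bool exclusive, bool reverse,
                       bool is_prod) {
  if (reverse) std::reverse(x.begin(), x.end());
  float acc = is_prod ? 1.0f : 0.0f;
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (exclusive) {
      out[i] = acc;  // value before including x[i]
      acc = is_prod ? acc * x[i] : acc + x[i];
    } else {
      acc = is_prod ? acc * x[i] : acc + x[i];
      out[i] = acc;  // value after including x[i]
    }
  }
  if (reverse) std::reverse(out.begin(), out.end());
  return out;
}

// Example: Cum({1,2,3,4}, /*exclusive=*/true, /*reverse=*/false, false)
// returns {0,1,3,6}, the same thing the lowering's pad-and-shift produces
// from the inclusive result {1,3,6,10}.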
void LegalizeTF::runOnFunction() { - if (failed( - legalizeTF(getFunction(), allow_partial_conversion_, legalize_chlo_))) + llvm::Optional tf2xla_fallback_device_type = llvm::None; + if (use_tf2xla_fallback_) { + tf2xla_fallback_device_type = device_type_; + } + if (failed(legalizeTF(getFunction(), allow_partial_conversion_, + legalize_chlo_, tf2xla_fallback_device_type))) { signalPassFailure(); + } } static PassRegistration pass( @@ -5726,19 +5986,38 @@ static PassRegistration pass( #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" -LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, - bool legalize_chlo) { +LogicalResult legalizeTF( + Operation *op, bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { MLIRContext *context = op->getContext(); - - // Add lowering patterns to the list. OwningRewritePatternList patterns; + // Note that the `OperationConverter` orders patterns lexicographically by: + // 1) Ascending legalization depth (i.e., minimum number of patterns necessary + // to arrive at conversion target). + // 2) Descending pattern benefit. + // 3) Order of patterns in `OwningRewritePatternList`. + + // Add TF->HLO legalization patterns. PopulateLegalizeTfPatterns(context, &patterns); + // Add TF->TF lowering patterns. + TF::PopulateLoweringTFPatterns(context, &patterns); + + // Add TF->HLO legalization patterns via TF2XLA fallback. + if (tf2xla_fallback_device_type.hasValue()) { + PopulateLegalizeTfWithTf2XlaPatterns(tf2xla_fallback_device_type.getValue(), + patterns); + } + // Populate with CHLO->HLO lowerings to account for TF ops legalized to // CHLO first. if (legalize_chlo) { chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); } + // ConstantLike op is convenient to create splat constants, but is + // canonicalized to plain HLO constant if statically shaped. Add the + // canonicalization pattern to pattern list to enable multi-hop lowering. + chlo::ConstantLikeOp::getCanonicalizationPatterns(patterns, context); ConversionTarget target(*context); if (legalize_chlo) { @@ -5773,28 +6052,25 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, void PopulateLegalizeTfPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { populateWithGenerated(context, patterns); - - // Add patterns that lower some of the high level TensorFlow ops to lower - // level TensorFlow ops. So, we don't have to target all the TensorFlow ops - // here for lowering to HLO. 
- TF::PopulateLoweringTFPatterns(context, patterns); patterns->insert< ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, - ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, - ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, - ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV2Op, - ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, - ConvertInplaceUpdateOp, ConvertLinSpaceOp, ConvertMaxOp, ConvertMinOp, - ConvertAvgPool2DOp, ConvertAvgPool3DOp, ConvertAvgPool2DGradOp, - ConvertAvgPool3DGradOp, ConvertMaxPool2DOp, ConvertMaxPool3DOp, - ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, ConvertMeanOp, - ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, ConvertProdOp, ConvertQrOp, - ConvertDynamicRangeOp, ConvertMatrixDiagPartV3Op, ConvertRangeOp, - ConvertSelectV2Op, ConvertSigmoidOp, ConvertShapeOp, ConvertSizeOp, + ConvertCumprodOp, ConvertCumsumOp, ConvertDiagPartOp, + ConvertDynamicExpandDimsOp, ConvertDynamicReshapeOp, ConvertEinsumOp, + ConvertRFFTOp, ConvertIRFFTOp, ConvertFusedBatchNormGradOp, + ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, + ConvertFusedBatchNormV2Op, ConvertFusedBatchNormV3Op, + ConvertInfeedDequeueTupleOp, ConvertInplaceUpdateOp, ConvertLinSpaceOp, + ConvertMaxOp, ConvertMinOp, ConvertAvgPool2DOp, ConvertAvgPool3DOp, + ConvertAvgPool2DGradOp, ConvertAvgPool3DGradOp, ConvertMaxPool2DOp, + ConvertMaxPool3DOp, ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, + ConvertMeanOp, ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, + ConvertProdOp, ConvertQrOp, ConvertDynamicRangeOp, + ConvertMatrixDiagPartV3Op, ConvertRangeOp, ConvertSelectV2Op, + ConvertSigmoidOp, ConvertShapeOp, ConvertSizeOp, ConvertSoftmaxOp, ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, ConvertStridedSliceOp, ConvertStridedSliceGradOp, ConvertSumOp, @@ -5806,8 +6082,10 @@ void PopulateLegalizeTfPatterns(MLIRContext *context, } std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion, bool legalize_chlo) { - return std::make_unique(allow_partial_conversion, legalize_chlo); + bool allow_partial_conversion, bool legalize_chlo, + llvm::Optional tf2xla_fallback_device_type) { + return std::make_unique(allow_partial_conversion, legalize_chlo, + tf2xla_fallback_device_type); } } // end namespace mhlo diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc index 588e31ab669..6320ad2032b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc @@ -22,15 +22,20 @@ limitations under the License. 
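With the extra parameter on createLegalizeTFPass, enabling the TF2XLA fallback programmatically means supplying a device type where callers previously passed nothing. A hedged usage sketch: the llvm::Optional element type is assumed to be StringRef (the template argument is not visible above), "XLA_CPU_JIT" is only an illustrative device string, and the wrapper function name is mine.

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"  // createLegalizeTFPass

// Builds the TF->HLO legalization pass with the TF2XLA fallback enabled.
// Passing llvm::None as the last argument keeps the previous behavior
// (no fallback); the device string below is a placeholder.
auto MakeLegalizeTFPassWithFallback() {
  llvm::Optional<llvm::StringRef> device_type = llvm::StringRef("XLA_CPU_JIT");
  return mlir::mhlo::createLegalizeTFPass(
      /*allow_partial_conversion=*/false,
      /*legalize_chlo=*/true,
      /*tf2xla_fallback_device_type=*/device_type);
}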
#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" @@ -49,45 +54,104 @@ const char kXlaHostTransferOriginalTypeAttr[] = "_xla_host_transfer_original_type"; // A pass that legalizes TF/XLA communication ops, propagate their respective -// tokens (for ordering), and rewrite their respective functions when necessary. +// tokens (for ordering), and rewrite their respective functions and control +// flow ops when necessary. // Note, this currently does not handle nested modules/functions or region based -// ops (e.g. control flow). +// ops other than certain control flow ops (`mhlo.if`, `mhlo.while`). class LegalizeTFCommunication : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override; }; -// Checks if a function has any communication ops. -bool HasCommunicationOps(FuncOp func) { - auto result = func.walk([](Operation* op) { - if (isa(op)) +// Checks if an op is a TF/XLA communication op. +bool IsCommunicationOp(Operation* op) { + return isa(op); +} + +// Checks if an op is a supported HLO control flow op. +bool IsControlFlowOp(Operation* op) { return isa(op); } + +// Collects control flow op ancestors of a given op, up until FuncOp. If any +// ancestor is not a control flow op or a FuncOp, or of a single block region, +// an error will be returned. +LogicalResult GetControlFlowAncestors( + Operation* op, llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks) { + Block* block = op->getBlock(); + Operation* parent = block->getParentOp(); + while (block && parent && !isa(parent)) { + if (!IsControlFlowOp(parent)) + return op->emitOpError() + << "expects ancestor(s) to be of ['" << IfOp::getOperationName() + << "', '" << FuncOp::getOperationName() << "']"; + + if (!llvm::hasSingleElement(block->getParent()->getBlocks())) + return op->emitOpError() << "expects single block region ancestor(s)"; + + control_flow_ops.insert(parent); + control_flow_blocks.insert(block); + + parent = block->getParentOp(); + block = parent->getBlock(); + } + return success(); +} + +// Finds communication ops in a function. `control_flow_ops` and +// `control_flow_blocks` will be populated with control flow op ancestors for +// every communication op. 
+LogicalResult FindCommunicationOps( + FuncOp func, llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks, + bool& has_communication_ops) { + auto result = func.walk([&](Operation* op) { + if (!IsCommunicationOp(op)) return WalkResult::advance(); + has_communication_ops = true; + if (failed( + GetControlFlowAncestors(op, control_flow_ops, control_flow_blocks))) return WalkResult::interrupt(); return WalkResult::advance(); }); - return result.wasInterrupted(); + return failure(result.wasInterrupted()); } -// Helper struct holding a function and optional cloned version. If `clone` is -// set, function calls to `original` will be replaced with `clone`. -struct FuncAndClone { +// Helper struct holding a function to be rewritten, it's control flow ops that +// lead to a communication op or function call with a communication op +// (transitively), and an optional clone of itself. If `clone` is set, function +// calls to `original` will be replaced with `clone`. +struct FuncToRewrite { FuncOp original; + llvm::SmallPtrSet control_flow_ops; + llvm::SmallPtrSet control_flow_blocks; FuncOp clone; }; // Finds all functions that need to be rewritten with communication ops and // and associated tokens. -llvm::SmallDenseMap GetFunctionsToRewrite( - ModuleOp module) { +LogicalResult GetFunctionsToRewrite( + ModuleOp module, + llvm::SmallDenseMap& funcs_to_rewrite) { // Find functions containing communication ops. - llvm::SmallDenseMap funcs; SmallVector funcs_to_visit; for (FuncOp func : module.getOps()) { - if (HasCommunicationOps(func)) { - funcs.insert({func.getName(), {func, /*clone=*/nullptr}}); - funcs_to_visit.push_back(func); - } + FuncToRewrite func_to_rewrite{/*original=*/func, /*control_flow_ops=*/{}, + /*control_flow_blocks=*/{}, + /*clone=*/nullptr}; + bool has_communication_ops = false; + if (failed(FindCommunicationOps(func, func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks, + has_communication_ops))) + return failure(); + + if (!has_communication_ops) continue; + funcs_to_rewrite.insert({func.getName(), func_to_rewrite}); + funcs_to_visit.push_back(func); } // Find functions that call functions with communication ops, transitively. @@ -100,13 +164,30 @@ llvm::SmallDenseMap GetFunctionsToRewrite( // Only `mlir::CallOp` is supported as this requires knowing how to // rewrite arguments and results to a function. 
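GetFunctionsToRewrite above is a worklist fixpoint over the call graph: any function that transitively calls a function containing a communication op must also be rewritten to thread the token. A compact standalone C++ model of that closure, with plain strings standing in for FuncOps and the symbol table, and the merging of control-flow ancestor sets left out; the function name is mine.

#include <map>
#include <set>
#include <string>
#include <vector>

// callers_of[f] lists the functions that call f. Starting from the functions
// that directly contain communication ops, keep adding callers until no new
// function appears; the result is every function that needs a token.
std::set<std::string> FunctionsToRewrite(
    const std::map<std::string, std::vector<std::string>>& callers_of,
    const std::set<std::string>& has_communication_op) {
  std::set<std::string> to_rewrite(has_communication_op);
  std::vector<std::string> worklist(has_communication_op.begin(),
                                    has_communication_op.end());
  while (!worklist.empty()) {
    std::string func = worklist.back();
    worklist.pop_back();
    auto it = callers_of.find(func);
    if (it == callers_of.end()) continue;
    for (const std::string& caller : it->second)
      if (to_rewrite.insert(caller).second) worklist.push_back(caller);
  }
  return to_rewrite;
}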
if (!isa(use.getUser())) continue; - auto caller_func = use.getUser()->getParentOfType(); - if (!caller_func) continue; - if (funcs - .insert( - {caller_func.getName(), {caller_func, /*clone=*/nullptr}}) - .second) - new_funcs_to_visit.push_back(caller_func); + auto caller_parent_func = use.getUser()->getParentOfType(); + if (!caller_parent_func) continue; + + FuncToRewrite func_to_rewrite{/*original=*/caller_parent_func, + /*control_flow_ops=*/{}, + /*control_flow_blocks=*/{}, + /*clone=*/nullptr}; + if (failed(GetControlFlowAncestors( + use.getUser(), func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks))) + return failure(); + + auto it = funcs_to_rewrite.insert( + {caller_parent_func.getName(), func_to_rewrite}); + if (it.second) { + new_funcs_to_visit.push_back(caller_parent_func); + } else { + it.first->getSecond().control_flow_ops.insert( + func_to_rewrite.control_flow_ops.begin(), + func_to_rewrite.control_flow_ops.end()); + it.first->getSecond().control_flow_blocks.insert( + func_to_rewrite.control_flow_blocks.begin(), + func_to_rewrite.control_flow_blocks.end()); + } } } @@ -116,8 +197,9 @@ llvm::SmallDenseMap GetFunctionsToRewrite( // Clone public functions that need to be rewritten. Function calls to this // function will be replaced with the cloned function. SymbolTable symbol_table(module); - for (auto& func : funcs) { - if (func.getSecond().original.isPublic()) { + for (auto& func : funcs_to_rewrite) { + if (func.getSecond().original.isPublic() && + !func.getSecond().original.symbolKnownUseEmpty(module)) { auto clone = func.getSecond().original.clone(); clone.setVisibility(SymbolTable::Visibility::Private); symbol_table.insert(clone); @@ -125,7 +207,7 @@ llvm::SmallDenseMap GetFunctionsToRewrite( } } - return funcs; + return success(); } // Assigns op sharding to an op for a given device core. @@ -137,11 +219,17 @@ void SetOpSharding(Operation* op, int64_t tpu_core) { } // Assigns frontend attributes holding information about data type and -// TensorFlow rendezvous channel name. -void SetFrontendAttributes(Operation* op, StringRef key, Type type) { +// TensorFlow rendezvous channel name. The TensorFlow rendezvous channel name is +// handled differently as individual names are used per data send and receive. +void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, + Type type, bool device_to_host) { MLIRContext* context = op->getContext(); - auto rendezvous_name = StringAttr::get(key, context); + std::string formatted_key = + device_to_host ? llvm::formatv("{0}_dtoh_{1}", key, index).str() + : llvm::formatv("{0}_htod_{1}", key, index).str(); + + auto rendezvous_name = StringAttr::get(formatted_key, context); auto rendezvous_name_attr = NamedAttribute( Identifier::get(kXlaHostTransferRendezvousNameAttr, context), rendezvous_name); @@ -161,24 +249,10 @@ void SetFrontendAttributes(Operation* op, StringRef key, Type type) { op->setAttr(kFrontendAttributesAttr, frontend_attributes); } -// Assigns frontend attributes holding information about data type and -// TensorFlow rendezvous channel name specific to `tf._XlaHostComputeMlir`. -// TensorFlow rendezvous channel name is handled differently as individual names -// are used per data send and receive. -void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, - Type type, bool device_to_host) { - std::string formatted_key = - device_to_host ? 
llvm::formatv("{0}_dtoh_{1}", key, index).str() - : llvm::formatv("{0}_htod_{1}", key, index).str(); - - return SetFrontendAttributes(op, formatted_key, type); -} - -// Creates a `mhlo.send` op for sending value `operand`. If `index` is set, -// `key` will be rewritten with a suffix and index. If `tpu_core` is set, op -// sharding for the respective device will be set. +// Creates a `mhlo.send` op for sending value `operand`. If `tpu_core` is set, +// op sharding for the respective device will be set. Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, - Value operand, StringRef key, const Optional& index, + Value operand, StringRef key, size_t index, const Optional& tpu_core, Value token) { // type 2 == DEVICE_TO_HOST auto channel_handle = ChannelHandle::get( @@ -188,23 +262,18 @@ Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, loc, token.getType(), operand, token, channel_handle, /*is_host_transfer=*/builder.getBoolAttr(true)); - if (index) { - SetFrontendAttributes(send, *index, key, operand.getType(), - /*device_to_host=*/true); - } else { - SetFrontendAttributes(send, key, operand.getType()); - } + SetFrontendAttributes(send, index, key, operand.getType(), + /*device_to_host=*/true); if (tpu_core) SetOpSharding(send, *tpu_core); return send.getResult(); } -// Creates a `mhlo.recv` op for receiving a value. If `index` is set, `key` will -// be rewritten with a suffix and index. If `tpu_core` is set, op sharding for -// the respective device will be set. +// Creates a `mhlo.recv` op for receiving a value. If `tpu_core` is set, op +// sharding for the respective device will be set. Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, - Value result, StringRef key, const Optional& index, + Value result, StringRef key, size_t index, const Optional& tpu_core, Value token) { // type 3 == HOST_TO_DEVICE auto channel_handle = ChannelHandle::get( @@ -216,12 +285,10 @@ Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, auto recv = builder.create(loc, recv_result_type, token, channel_handle, /*is_host_transfer=*/builder.getBoolAttr(true)); - if (index) { - SetFrontendAttributes(recv, *index, key, result_type, - /*device_to_host=*/false); - } else { - SetFrontendAttributes(recv, key, result.getType()); - } + + SetFrontendAttributes(recv, index, key, result_type, + /*device_to_host=*/false); + if (tpu_core) SetOpSharding(recv, *tpu_core); auto get_tuple_element = @@ -291,7 +358,7 @@ Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, builder.setInsertionPoint(send_to_host); token = CreateSendOp(builder, channel_id, send_to_host.getLoc(), send_to_host.input(), send_to_host.key(), - /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + /*index=*/0, /*tpu_core=*/llvm::None, token); send_to_host.erase(); return token; @@ -303,7 +370,7 @@ Value RewriteRecvFromHostOp(OpBuilder& builder, int64_t& channel_id, builder.setInsertionPoint(recv_from_host); token = CreateRecvOp(builder, channel_id, recv_from_host.getLoc(), recv_from_host.output(), recv_from_host.key(), - /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + /*index=*/0, /*tpu_core=*/llvm::None, token); recv_from_host.erase(); return token; @@ -329,94 +396,489 @@ Value RewriteCallOp(OpBuilder& builder, CallOp call, return new_call.getResults().back(); } -// Updates function terminator and type if a token is to be emitted by the -// function. 
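Each send/recv now gets its own rendezvous name built from the op key, the per-op transfer index, and the transfer direction, as formatted by SetFrontendAttributes above. A one-function C++ sketch of the naming scheme (plain string concatenation instead of llvm::formatv; the function name is mine).

#include <cstdint>
#include <string>

// "send_key" with index 0, device-to-host  -> "send_key_dtoh_0"
// "recv_key" with index 2, host-to-device  -> "recv_key_htod_2"
std::string RendezvousName(const std::string& key, int32_t index,
                           bool device_to_host) {
  return key + (device_to_host ? "_dtoh_" : "_htod_") + std::to_string(index);
}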
-void RewriteFunctionTerminatorAndUpdateType(OpBuilder& builder, FuncOp func, - Block& func_body, Value token) { - // If the function signature is changed, update to emit a token and update - // the function type. - Operation* terminator = func_body.getTerminator(); - auto new_results = llvm::to_vector<4>(terminator->getOperands()); - new_results.push_back(token); - builder.setInsertionPoint(terminator); - auto new_return = - builder.create(terminator->getLoc(), new_results); - terminator->erase(); +// Helper struct holding state of which op to visit to next. If `op` is in a +// control flow op region, `region_idx` will be set with the respective region +// index. `token` will be current token from the last communication op/control +// flow op transitive communication ops. +struct OpVisitorState { + Optional region_idx; + Value token; + Operation* op; +}; +// Creates a tuple from a sequence of values. +Value CreateTuple(OpBuilder& builder, Location loc, ArrayRef operands) { + return builder.create(loc, operands).getResult(); +} + +// Replaces a value `value` with a new value but the token attached. If `value` +// is not a tuple, a new tuple is formed with `token`. If `value` is a tuple, +// `value` is extended instead. New tuple values created are cached. +Value GetValueWithToken(OpBuilder& builder, Value value, Value token, + llvm::SmallDenseMap& rewritten_values) { + // If value with token already exists, reuse it. + auto it = rewritten_values.find(value); + if (it != rewritten_values.end()) return it->getSecond(); + + auto create_tuple = [&](ArrayRef operands) { + auto new_result = CreateTuple(builder, value.getLoc(), operands); + rewritten_values.insert({value, new_result}); + return new_result; + }; + + auto tuple_type = value.getType().dyn_cast(); + // `value` is not a tuple, create a new tuple. + if (!tuple_type) return create_tuple({value, token}); + + // Extend tuple if `value` is a tuple. + // If `value` is an op result and the owner is a `mhlo.tuple`, simply unpack + // the tuple. + if (auto tuple_op = value.getDefiningOp()) { + auto tuple_operands = llvm::to_vector<4>(tuple_op.getOperands()); + tuple_operands.push_back(token); + return create_tuple(tuple_operands); + } + + // `value` is not created via a `mhlo.tuple` directly, unpack individual + // elements directly with `mhlo.get_tuple_element`. + SmallVector tuple_operands; + for (auto idx : llvm::seq(0, tuple_type.getTypes().size())) + tuple_operands.push_back( + builder.create(value.getLoc(), value, idx) + .getResult()); + + tuple_operands.push_back(token); + return create_tuple(tuple_operands); +} + +// Extends a type to include a `mhlo.token` type. If `type` is not a tuple type, +// a new tuple type with `type` and `mhlo.token` type is created instead. +TupleType GetTypeWithToken(OpBuilder& builder, Type type) { + auto token_type = TokenType::get(builder.getContext()); + if (auto tuple_type = type.dyn_cast()) { + auto result_types = llvm::to_vector<4>(tuple_type.getTypes()); + result_types.push_back(token_type); + return builder.getTupleType(result_types); + } + + return builder.getTupleType({type, token_type}); +} + +// Creates a slice of a tuple `value` with `mhlo.get_tuple_element` from index 0 +// to `end`, exclusive. 
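GetValueWithToken and GetTypeWithToken above, together with CreateSubTuple just below, are tuple surgery: append an mhlo.token element to the value and type flowing through a control-flow region, then re-slice the original elements back out for pre-existing users. A schematic C++ model of just the type-level bookkeeping, with element types represented as strings and no MLIR involved; the function names are mine.

#include <cstddef>
#include <string>
#include <vector>

// A result type is modeled as the list of its element types; a non-tuple
// value is a single-element list. Appending the token mirrors
// GetTypeWithToken, and taking the first `end` elements mirrors
// CreateSubTuple, which rebuilds the original value for existing users.
std::vector<std::string> WithToken(std::vector<std::string> types) {
  types.push_back("!mhlo.token");
  return types;
}

std::vector<std::string> SubTuple(const std::vector<std::string>& types,
                                  std::size_t end) {
  return std::vector<std::string>(types.begin(), types.begin() + end);
}

// e.g. WithToken({"tensor<f32>", "tensor<i32>"}) yields
// {"tensor<f32>", "tensor<i32>", "!mhlo.token"}, and SubTuple(that, 2)
// recovers the original pair for the ops that still expect it.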
+Value CreateSubTuple(OpBuilder& builder, Value value, size_t end) { + SmallVector tuple_operands; + for (auto idx : llvm::seq(0, end)) + tuple_operands.push_back( + builder.create(value.getLoc(), value, idx) + .getResult()); + + return CreateTuple(builder, value.getLoc(), tuple_operands); +} + +// Replaces uses of `value` with `replacement`. If `value` is not a tuple type, +// an explicit `mhlo.get_tuple_element` is created to unpack the tuple and +// return the first element. Otherwise, `mhlo.get_tuple_element` users are +// simply updated with `replacement`, and all other users are updated with a +// slice of `replacement`. +void ReplaceWithTupleResult(OpBuilder& builder, Value value, + Value replacement) { + auto tuple_type = value.getType().dyn_cast(); + if (!tuple_type) { + if (!value.use_empty()) { + auto new_element = builder.create(replacement.getLoc(), + replacement, 0); + value.replaceAllUsesWith(new_element.getResult()); + } + return; + } + + Value sub_tuple; + for (auto& use : llvm::make_early_inc_range(value.getUses())) { + if (isa(use.getOwner())) { + use.set(replacement); + continue; + } + + if (!sub_tuple) + sub_tuple = CreateSubTuple(builder, replacement, tuple_type.size()); + + use.set(sub_tuple); + } +} + +// Replaces control flow op block single block argument with new block argument +// of type `new_type` (tuple type). The last element of the new block argument +// (token) is returned. +Value UpdateControlFlowBlockArgWithToken(OpBuilder& builder, Block& block, + Type token_type) { + assert(block.getNumArguments() == 1); + builder.setInsertionPointToStart(&block); + auto new_arg = block.addArgument(token_type); + ReplaceWithTupleResult(builder, block.getArgument(0), new_arg); + block.eraseArgument(0); + return builder + .create(new_arg.getLoc(), new_arg, + token_type.cast().size() - 1) + .getResult(); +} + +// Updates control flow op terminator with an extra element `token`. If the +// original return value is not a tuple, a new tuple is formed. Otherwise the +// tuple is extended. +void RewriteControlFlowTerminator(OpBuilder& builder, Operation* terminator, + Value token) { + assert(terminator->getNumOperands() == 1); + assert(terminator->getBlock()->getNumArguments() == 1); + // `mhlo.while` cond terminator does not need to be rewritten as it always + // returns a tensor predicate value. + if (auto while_parent = dyn_cast_or_null(terminator->getParentOp())) + if (terminator->getParentRegion() == &while_parent.cond()) return; + + builder.setInsertionPoint(terminator); + llvm::SmallDenseMap rewritten_operands; + Value new_result = GetValueWithToken(builder, terminator->getOperand(0), + token, rewritten_operands); + terminator->setOperand(0, new_result); +} + +// Rewrites a `mhlo.if` op to receive and forward a `mhlo.token`. Operands to +// the op for all of its regions are extended to have an extra operand `token`. +void RewriteRegionIfOp(OpBuilder& builder, IfOp region_if, + SmallVectorImpl& ops_to_visit, + Value token) { + llvm::SmallDenseMap rewritten_operands; + + // Rewrite all region operands to have an extra operand `token`. + Value new_true_operand = GetValueWithToken(builder, region_if.true_arg(), + token, rewritten_operands); + Value new_false_operand = GetValueWithToken(builder, region_if.false_arg(), + token, rewritten_operands); + + auto new_result_type = GetTypeWithToken(builder, region_if.getType()); + + // Create new `mhlo.if` op with extra token operands and result. 
+ auto new_if = builder.create(region_if.getLoc(), new_result_type, + region_if.pred(), new_true_operand, + new_false_operand); + + // Move all regions from the old `mhlo.if` op to its replacement. + new_if.true_branch().takeBody(region_if.true_branch()); + new_if.false_branch().takeBody(region_if.false_branch()); + + // Forward result from old `mhlo.if` with replacement, and unpack result when + // necessary. + ReplaceWithTupleResult(builder, region_if.getResult(), new_if.getResult()); + + auto new_token = builder.create( + new_if.getLoc(), new_if.getResult(), + new_if.getResult().getType().cast().size() - 1); + + region_if.erase(); + + // Remove leftover operands to old `mhlo.if` if they have no uses. + for (auto& rewritten_operand : rewritten_operands) + if (auto tuple_op = rewritten_operand.getFirst().getDefiningOp()) + if (tuple_op.use_empty()) tuple_op.erase(); + + // Next op to visit. The replacement is visited but at its first region. The + // token result of the new region if is propagated. + ops_to_visit.push_back({/*region_idx=*/0, new_token, new_if}); +} + +// Rewrites a `mhlo.if`/`mhlo.while` region to receive and forward a +// `mhlo.token`. The block argument is updated to have an extra `mhlo.token` +// element. If the region block is to be rewritten, the next op to visit is set +// to the first op in the block. Otherwise the terminator is updated to forward +// `token`. +void RewriteControlFlowOpRegion( + OpBuilder& builder, Operation* region_op, unsigned region_idx, + Type block_arg_type, SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, Value token) { + ops_to_visit.push_back({region_idx + 1, token, region_op}); + + Region& region = region_op->getRegion(region_idx); + assert(llvm::hasSingleElement(region)); + + auto block_token = UpdateControlFlowBlockArgWithToken(builder, region.front(), + block_arg_type); + + if (control_flow_blocks.contains(®ion.front())) { + ops_to_visit.push_back({/*region_idx=*/llvm::None, block_token, + block_token.getDefiningOp()->getNextNode()}); + return; + } + + RewriteControlFlowTerminator(builder, region.front().getTerminator(), + block_token); +} + +// Rewrites an `mhlo.if` op or its region. If `region_idx` is not set, the op +// operands and results are rewritten. If `region_idx` is set, region +// `region_idx` is rewritten to take in and return an additional token. Returns +// true if the op or its region was rewritten. +bool ProcessRegionIfOp(OpBuilder& builder, IfOp region_if, + Optional region_idx, + SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, + Value token) { + builder.setInsertionPoint(region_if); + + if (!region_idx) { + RewriteRegionIfOp(builder, region_if, ops_to_visit, token); + return true; + } + + if (*region_idx < region_if.getNumRegions()) { + RewriteControlFlowOpRegion(builder, region_if, *region_idx, + region_if.getOperand(*region_idx + 1).getType(), + ops_to_visit, control_flow_blocks, token); + return true; + } + + return false; +} + +// Rewrites a `mhlo.while` op to receive and forward a `mhlo.token`. Operands to +// the op for all of its regions are extended to have an extra operand `token`. +void RewriteRegionWhileOp(OpBuilder& builder, WhileOp region_while, + SmallVectorImpl& ops_to_visit, + Value token) { + llvm::SmallDenseMap rewritten_operands; + + // Rewrite region operand to have an extra operand `token`. 
+ Value new_val_operand = + GetValueWithToken(builder, region_while.val(), token, rewritten_operands); + + auto new_result_type = GetTypeWithToken(builder, region_while.getType()); + + // Create new `mhlo.while` op with extra token operand and result. + auto new_while = builder.create(region_while.getLoc(), + new_result_type, new_val_operand); + + // Move all regions from the old `mhlo.while` op to its replacement. + new_while.cond().takeBody(region_while.cond()); + new_while.body().takeBody(region_while.body()); + + // Forward result from old `mhlo.while` with replacement, and unpack result + // when necessary. + ReplaceWithTupleResult(builder, region_while.getResult(), + new_while.getResult()); + + auto new_token = builder.create( + new_while.getLoc(), new_while.getResult(), + new_while.getResult().getType().cast().size() - 1); + + region_while.erase(); + + // Remove leftover operands to old `mhlo.while` if they have no uses. + for (auto& rewritten_operand : rewritten_operands) + if (auto tuple_op = rewritten_operand.getFirst().getDefiningOp()) + if (tuple_op.use_empty()) tuple_op.erase(); + + // Next op to visit. The replacement is visited but at its first region. The + // token result of the new region if is propagated. + ops_to_visit.push_back({/*region_idx=*/0, new_token, new_while}); +} + +// Rewrites an `mhlo.while` op or its region. If `region_idx` is not set, the op +// operands and results are rewritten. If `region_idx` is set, region +// `region_idx` is rewritten to take in and return an additional token. Returns +// true if the op or its region was rewritten. +bool ProcessRegionWhileOp( + OpBuilder& builder, WhileOp region_while, Optional region_idx, + SmallVectorImpl& ops_to_visit, + const llvm::SmallPtrSetImpl& control_flow_blocks, Value token) { + builder.setInsertionPoint(region_while); + + if (!region_idx) { + RewriteRegionWhileOp(builder, region_while, ops_to_visit, token); + return true; + } + + if (*region_idx < region_while.getNumRegions()) { + RewriteControlFlowOpRegion(builder, region_while, *region_idx, + region_while.val().getType(), ops_to_visit, + control_flow_blocks, token); + return true; + } + + return false; +} + +// Updates function type based on current function body block arguments and +// terminator operand types. +void UpdateFunctionType(OpBuilder& builder, FuncOp func, Block& func_body) { auto new_argument_types = llvm::to_vector<4>(func_body.getArgumentTypes()); - auto new_result_types = llvm::to_vector<4>(new_return.getOperandTypes()); + auto new_result_types = + llvm::to_vector<4>(func_body.getTerminator()->getOperandTypes()); func.setType(FunctionType::get(new_argument_types, new_result_types, builder.getContext())); } -// Rewrites a function body and communication ops inside. The function may -// either be rewritten to create a token or take in and return a token, -// depending on its visibility and if there are any callers. +// Replaces a function terminator `return` with another `return` that has an +// extra `mhlo.token` operand. +void RewriteFunctionTerminator(OpBuilder& builder, mlir::ReturnOp terminator, + Value token) { + auto new_results = llvm::to_vector<4>(terminator.getOperands()); + new_results.push_back(token); + builder.setInsertionPoint(terminator); + builder.create(terminator.getLoc(), new_results); + terminator.erase(); +} + +// Rewrites a function body and communication ops inside. Region control flow +// are updated when necessary, to propagate tokens. 
The function may either be +// rewritten to create a token or take in and return a token, depending on its +// visibility and if there are any callers. LogicalResult RewriteFunction( OpBuilder& builder, int64_t& channel_id, ModuleOp module, FuncOp func, - const llvm::SmallDenseMap& funcs) { + const llvm::SmallDenseMap& funcs, + const llvm::SmallPtrSetImpl& control_flow_ops, + const llvm::SmallPtrSetImpl& control_flow_blocks, bool is_clone) { MLIRContext* context = module.getContext(); if (!llvm::hasSingleElement(func.getBody())) return func.emitError() << "'" << FuncOp::getOperationName() << "' ops with more than one block are not supported"; - bool rewrite_block = !func.isPublic() && !func.symbolKnownUseEmpty(module); + bool rewrite_block = + is_clone || (!func.isPublic() && !func.symbolKnownUseEmpty(module)); Block& func_body = func.front(); builder.setInsertionPointToStart(&func_body); - auto token_type = mlir::mhlo::TokenType::get(context); + auto token_type = TokenType::get(context); // If a function is public, it's signature should not be modified, and instead // a token will be created. Otherwise a token block argument is inserted. - Value token = rewrite_block - ? func_body.addArgument(token_type) + Value init_token = + rewrite_block ? func_body.addArgument(token_type) : builder.create(func.getLoc(), token_type) .getResult(); - for (Operation& op : llvm::make_early_inc_range(func_body)) { - if (auto host_compute = dyn_cast(op)) { + // Stack to keep track of region based control flow op nesting and current + // op to visit. + SmallVector ops_to_visit{ + {/*region_idx=*/llvm::None, init_token, &func_body.front()}}; + + while (!ops_to_visit.empty()) { + OpVisitorState op_to_visit = ops_to_visit.pop_back_val(); + Operation* curr_op = op_to_visit.op; + + Value token = op_to_visit.token; + // Ops may be removed, so the next op is kept track of beforehand. + Operation* next_op = curr_op->getNextNode(); + + if (auto host_compute = dyn_cast(curr_op)) { token = RewriteHostComputeOp(builder, channel_id, host_compute, token); - } else if (auto send_to_host = dyn_cast(op)) { + } else if (auto send_to_host = dyn_cast(curr_op)) { token = RewriteSendToHostOp(builder, channel_id, send_to_host, token); - } else if (auto recv_from_host = dyn_cast(op)) { + } else if (auto recv_from_host = dyn_cast(curr_op)) { token = RewriteRecvFromHostOp(builder, channel_id, recv_from_host, token); - } else if (auto call = dyn_cast(op)) { + } else if (auto call = dyn_cast(curr_op)) { // Only `mlir::CallOp` is supported as this requires knowing how to // rewrite arguments and results to a function. auto it = funcs.find(call.getCallee()); - if (it == funcs.end()) continue; - FuncOp clone = it->getSecond().clone; - Optional symbol_name = - clone ? Optional(clone.getName()) : llvm::None; - // If the function being called is to be cloned, update the call to also - // point to the cloned function. - token = RewriteCallOp(builder, call, symbol_name, token); + if (it != funcs.end()) { + FuncOp clone = it->getSecond().clone; + Optional symbol_name = + clone ? Optional(clone.getName()) : llvm::None; + // If the function being called is to be cloned, update the call to also + // point to the cloned function. 
+ token = RewriteCallOp(builder, call, symbol_name, token); + } + } else if (auto region_if = dyn_cast(curr_op)) { + if (op_to_visit.region_idx || control_flow_ops.contains(region_if)) + if (ProcessRegionIfOp(builder, region_if, op_to_visit.region_idx, + ops_to_visit, control_flow_blocks, token)) + continue; + } else if (auto region_while = dyn_cast(curr_op)) { + if (op_to_visit.region_idx || control_flow_ops.contains(region_while)) + if (ProcessRegionWhileOp(builder, region_while, op_to_visit.region_idx, + ops_to_visit, control_flow_blocks, token)) + continue; + } else if (auto region_terminator = dyn_cast(curr_op)) { + RewriteControlFlowTerminator(builder, region_terminator, token); + // There is no next op after the control flow op terminator, simply let the + // stack have one less element. + continue; + } else if (auto func_terminator = dyn_cast(curr_op)) { + if (rewrite_block) + RewriteFunctionTerminator(builder, func_terminator, token); + + // There is no next op after the function terminator, simply let the stack + // have one less element/be empty. + continue; + } + + // Visit next op. + ops_to_visit.push_back({/*region_idx=*/llvm::None, token, next_op}); } - if (rewrite_block) - RewriteFunctionTerminatorAndUpdateType(builder, func, func_body, token); + if (rewrite_block) UpdateFunctionType(builder, func, func_body); return success(); } +// Checks if a function call points to a function with communication ops. +bool IsFunctionCallWithCommunication( + Operation* op, + const llvm::SmallDenseMap& funcs_to_rewrite) { + if (auto call = dyn_cast(op)) + return funcs_to_rewrite.count(call.callee()); + + return false; +} + +// Collects all control flow op ancestors of communication ops or function calls +// with communication ops (transitively). +void GetCommunicationControlFlowOps( + FuncOp func, + const llvm::SmallDenseMap& funcs_to_rewrite, + llvm::SmallPtrSetImpl& control_flow_ops, + llvm::SmallPtrSetImpl& control_flow_blocks) { + func.walk([&](Operation* op) { + if (IsCommunicationOp(op) || + IsFunctionCallWithCommunication(op, funcs_to_rewrite)) + if (failed(GetControlFlowAncestors(op, control_flow_ops, + control_flow_blocks))) + llvm_unreachable( + "checking original function for control flow ancestors should have " + "errored first"); + }); +} + void LegalizeTFCommunication::runOnOperation() { auto module = getOperation(); - llvm::SmallDenseMap funcs = - GetFunctionsToRewrite(module); + llvm::SmallDenseMap funcs_to_rewrite; + if (failed(GetFunctionsToRewrite(module, funcs_to_rewrite))) + return signalPassFailure(); // Module level counter to make sure Channel Id's are unique.
int64_t channel_id = 1; OpBuilder builder(&getContext()); - for (const auto& func_and_name : funcs) { - FuncOp func = func_and_name.getSecond().original; - if (failed(RewriteFunction(builder, channel_id, module, func, funcs))) + for (const auto& func_and_name : funcs_to_rewrite) { + const auto& func_to_rewrite = func_and_name.getSecond(); + FuncOp func = func_to_rewrite.original; + if (failed(RewriteFunction(builder, channel_id, module, func, + funcs_to_rewrite, + func_to_rewrite.control_flow_ops, + func_to_rewrite.control_flow_blocks, + /*is_clone=*/false))) return signalPassFailure(); FuncOp clone = func_and_name.getSecond().clone; if (!clone) continue; - if (failed(RewriteFunction(builder, channel_id, module, clone, funcs))) - return signalPassFailure(); + llvm::SmallPtrSet clone_control_flow_ops; + llvm::SmallPtrSet clone_control_flow_blocks; + GetCommunicationControlFlowOps(clone, funcs_to_rewrite, + clone_control_flow_ops, + clone_control_flow_blocks); + if (failed(RewriteFunction(builder, channel_id, module, clone, + funcs_to_rewrite, clone_control_flow_ops, + clone_control_flow_blocks, + /*is_clone=*/true))) + llvm_unreachable( + "rewriting of original function should have errored first"); } } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 760252331e0..4e76baa6805 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -119,8 +119,8 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo // return op. - ImportXlaRegion(op.then_func(), &if_op.true_branch(), loc); - ImportXlaRegion(op.else_func(), &if_op.false_branch(), loc); + ImportXlaRegion(op.then_function(), &if_op.true_branch(), loc); + ImportXlaRegion(op.else_function(), &if_op.false_branch(), loc); // De-tuple the results of the xla hlo if result. Detuple(if_op.getResult(), op.getResults(), &builder); @@ -172,8 +172,8 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { // Import the regions for both the cond and body. These regions must be // updated to tuple the return results together and use the xla hlo return op. - ImportXlaRegion(op.body_func(), &while_op.body(), loc); - ImportXlaRegion(op.cond_func(), &while_op.cond(), loc, + ImportXlaRegion(op.body_function(), &while_op.body(), loc); + ImportXlaRegion(op.cond_function(), &while_op.cond(), loc, /*tuple_return=*/false); // De-tuple the results of the xla hlo while. 
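Stepping back from the diff: the communication pass above replaces straight-line iteration over the function body with an explicit stack of OpVisitorState entries, so a token can be threaded into and out of nested `mhlo.if`/`mhlo.while` regions. Below is a simplified, self-contained model of that traversal; the Node/Token types and handler names are hypothetical stand-ins, and the plumbing that returns a token out of a rewritten control flow op's result is elided here:

#include <optional>
#include <vector>

struct Node {
  Node* next = nullptr;        // next op in the same block
  std::vector<Node*> regions;  // first op of each nested region
  bool is_terminator = false;
};

using Token = int;  // stand-in for the mhlo.token value being threaded

struct VisitorState {
  std::optional<unsigned> region_idx;  // set when re-visiting a region op
  Token token;                         // token produced so far
  Node* op;
};

// Walks a block (and nested regions) in order, giving every visited op the
// current token, in the spirit of the ops_to_visit loop above.
void ThreadToken(Node* first_op, Token init_token) {
  std::vector<VisitorState> ops_to_visit{{std::nullopt, init_token, first_op}};
  while (!ops_to_visit.empty()) {
    VisitorState state = ops_to_visit.back();
    ops_to_visit.pop_back();
    Node* op = state.op;
    Token token = state.token;
    // Nothing follows a terminator (or the end of a block).
    if (op == nullptr || op->is_terminator) continue;
    unsigned region_idx = state.region_idx.value_or(0);
    if (region_idx < op->regions.size()) {
      // Re-visit this op at its next region once the current one is done,
      // then descend into the current region.
      ops_to_visit.push_back({region_idx + 1, token, op});
      ops_to_visit.push_back({std::nullopt, token, op->regions[region_idx]});
      continue;
    }
    // Leaf op (or all regions visited): pretend it consumes and produces a
    // token, then move on to the next op in the block.
    Token new_token = token + 1;
    ops_to_visit.push_back({std::nullopt, new_token, op->next});
  }
}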
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 0ef62deed7d..b1460421f16 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -31,7 +31,7 @@ def IEEEFloatTensor : TensorOf<[F16, F32, F64]>; //===----------------------------------------------------------------------===// def FeatureDimension : NativeCodeCall< - "getFeatureDimensionAttr($_builder, $0, $1)">; + "getFeatureDimensionAttr($_builder, $0.getValue(), $1)">; def FalseBoolAttr : AttrConstraint>; def TrueBoolAttr : AttrConstraint>; @@ -51,6 +51,10 @@ def GetHLOAxisFromTFAxisVariadic : NativeCodeCall< "$0, (*$1.begin()).getType().cast().getRank(), " "&$_builder)">; +def CastElementsToI64Elements : NativeCodeCall< + "hlo::ConvertElementsAttr(" + "$0, $_builder.getIntegerType(64)).cast()">; + def : Pattern< (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, $epsilon, $exponential_avg_factor, $data_format, @@ -82,7 +86,7 @@ def AreBroadcastCompatible : Constraint, "types must be broadcastable">; class DirectBinaryPat - : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), + : Pat<(FromOp AnyTensor:$l, AnyTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], @@ -128,7 +132,7 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), // return x / y; // } // -// BraodcastToDimensions is used to compute the broadcast attr to higher +// BroadcastToDimensions is used to compute the broadcast attr to higher // dimensions. This computes the broadcast of 'l' to broadcast('l', 'r') // without returning the broadcast of 'r' to broadcast('l', 'r'). // @@ -143,14 +147,14 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $l, $r), HLO_COMPARISON_DIRECTION_EQ), - (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), - (HLOClient_BroadcastDivOp - (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), + (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), + (HLOClient_BroadcastDivOp + (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), (HLOClient_BroadcastSubOp (HLO_AbsOp $r), (HLO_ConstOp (GetScalarOfType<1> $r)), (NullDenseIntElementsAttr)), (BinBroadcastDimensions $l, $r))), - (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), + (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), [(SignedIntTensor $l)]>; // Performs a substitution of FloorMod designed to correct for possibly negative @@ -175,8 +179,8 @@ def : Pat<(TF_FloorModOp AnyRankedTensor:$l, AnyRankedTensor:$r), (BinBroadcastDimensions $rem, $r_zeros), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $r_cmp, $rem_cmp), HLO_COMPARISON_DIRECTION_NE), (NullDenseIntElementsAttr)), - (HLOClient_BroadcastAddOp $r, - $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; + (HLOClient_BroadcastAddOp $r, + $rem, (BinBroadcastDimensions $r, $rem)), $rem)>; //===----------------------------------------------------------------------===// // Logical & bitwise binary op patterns. 
@@ -255,12 +259,16 @@ def : Pat<(TF_ConcatV2Op $inputs, (TF_ConstOp OneElementAttr:$axis)), [(HasRankedFirstOperand $inputs)]>; //===----------------------------------------------------------------------===// -// CrossReplicaSum op patterns. +// CollectivePermute op patterns. //===----------------------------------------------------------------------===// -def CastElementsToI64Elements : NativeCodeCall< - "hlo::ConvertElementsAttr(" - "$0, $_builder.getIntegerType(64)).cast()">; +def : Pat<(TF_CollectivePermuteOp $input, (TF_ConstOp $source_target_pairs)), + (HLO_CollectivePermuteOp $input, + (CastElementsToI64Elements $source_target_pairs))>; + +//===----------------------------------------------------------------------===// +// CrossReplicaSum op patterns. +//===----------------------------------------------------------------------===// def : Pat<(TF_CrossReplicaSumOp $input, (TF_ConstOp $group_assignment)), (HLO_CrossReplicaSumOp $input, @@ -277,9 +285,19 @@ def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (TF_ConstOp $group_assignment), // FFT op patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_RFFTOp $input, (TF_ConstOp I32ElementsAttr:$fft_length)), - (HLO_FftOp $input, HLO_FFT_TYPE_RFFT, - (CastElementsToI64Elements $fft_length))>; +def GetInnerDimFromValue : NativeCodeCall< + "GetInnerDimFromValue($0.getType().cast(), &$_builder)">; + +def CheckInnerDimStatic + : Constraint(), &$_builder)">>; + +def : Pat<(TF_FFTOp:$res $input), + (HLO_FftOp $input, HLO_FFT_TYPE_FFT, (GetInnerDimFromValue $res)), + [(CheckInnerDimStatic $input)]>; + +def : Pat<(TF_IFFTOp:$res $input), + (HLO_FftOp $input, HLO_FFT_TYPE_IFFT, (GetInnerDimFromValue $res)), + [(CheckInnerDimStatic $input)]>; //===----------------------------------------------------------------------===// // GatherV2 op patterns. @@ -427,6 +445,35 @@ def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (TensorCastOp (HLO_ConstOp $value)), [(HLO_Tensor $res)]>; +//===----------------------------------------------------------------------===// +// Elu op patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(TF_EluOp AnyRankedTensor:$features), + (HLO_SelectOp + (HLOClient_BroadcastCompareOp + $features, + (HLO_ConstOp:$zero (GetScalarOfType<0> $features)), + (BinBroadcastDimensions $zero, $features), + HLO_COMPARISON_DIRECTION_GT), + $features, + (HLO_Expm1Op $features))>; + +def : Pat<(TF_EluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), + (HLO_SelectOp + (HLOClient_BroadcastCompareOp + $features, + (HLO_ConstOp:$zero (GetScalarOfType<0> $features)), + (BinBroadcastDimensions $zero, $features), + HLO_COMPARISON_DIRECTION_GT), + $gradients, + (HLO_MulOp + $gradients, + (HLOClient_BroadcastAddOp + $features, + (HLO_ConstOp:$one (GetScalarOfType<1> $features)), + (BinBroadcastDimensions $one, $features))))>; + //===----------------------------------------------------------------------===// // Relu op patterns. 
//===----------------------------------------------------------------------===// @@ -542,24 +589,12 @@ foreach Mapping = [ [TF_SinOp, HLO_SinOp], [TF_SqrtOp, HLO_SqrtOp], [TF_TanhOp, HLO_TanhOp], + [TF_TanOp, HLOClient_TanOp], ] in { def : Pat<(Mapping[0] HLO_Tensor:$input), (Mapping[1] $input)>; } -// Expand acos to MHLO dialect as follows: -// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 -// = pi if x == -1 -def : Pat<(HLOClient_AcosOp $input), (HLO_SelectOp - (HLO_CompareOp $input, (HLO_ConstOp (ConstantSplat<"0"> $input)), - HLO_COMPARISON_DIRECTION_NE), - (HLO_MulOp (HLO_ConstOp (ConstantSplat<"2"> $input)), - (HLO_Atan2Op (HLO_SqrtOp (HLO_SubOp - (HLO_ConstOp (ConstantSplat<"1"> $input)), - (HLO_MulOp $input, $input))), - (HLO_AddOp (HLO_ConstOp (ConstantSplat<"1"> $input)), $input))), - (HLO_ConstOp (ConstantSplat<"M_PI"> $input)))>; - // TODO(bixia): Lower Cast with a Complex type source operand or with // Truncate=True for floating point value conversions. def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), @@ -594,6 +629,9 @@ def : Pat<(TF_BitcastOp:$res HLO_Tensor:$arg), (HLO_BitcastConvertOp $arg), [(BothElementTypesSameWidthIntOrFloat $res, $arg)]>; +// TODO(jpienaar): Lower constant like to constant to broadcast if dynamic +// and going to MHLO. + //===----------------------------------------------------------------------===// // Random ops. //===----------------------------------------------------------------------===// @@ -657,3 +695,19 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), ), (replaceWithValue $output) ]>; + +//===----------------------------------------------------------------------===// +// XlaGather op. +//===----------------------------------------------------------------------===// + +def ToGatherDimNumsAttr : NativeCodeCall<"GetGatherDimNumsAttr($0, &$_builder)">; + +def HasValidGatherDims : Constraint>; + +def : Pat<(TF_XlaGatherOp $operand, $start_indices, (TF_ConstOp $slice_sizes), + $dimension_numbers, $indices_are_sorted), + (HLO_GatherOp $operand, $start_indices, + (ToGatherDimNumsAttr $dimension_numbers), + (CastElementsToI64Elements $slice_sizes), + $indices_are_sorted), + [(HasValidGatherDims $dimension_numbers)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index bb50fc198c8..b06edcd3db8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -74,17 +75,14 @@ limitations under the License. namespace mlir { namespace mhlo { -namespace { -template -using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok - -static bool IsOpAllowlisted(Operation* op) { +bool IsOpAllowedTf2XlaFallback(Operation* op) { // Allowlisted TensorFlow ops are known to have well behaved tf2xla kernels // building valid MLIR using MlirHloBuilder. 
// TODO(hinsu): Drop explicit allowlist when MLIR based bridge is enabled for // all tf2xla kernels. // clang-format off + static llvm::SmallDenseSet ops = { TypeID::get(), TypeID::get(), @@ -104,6 +102,11 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -112,12 +115,17 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -126,6 +134,7 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -137,10 +146,11 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -151,24 +161,38 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + // TODO(hinsu): Canonicalize QuantizeAndDequantize and + // QuantizeAndDequantizeV2 to QuantizeAndDequantizeV3 by converting + // attributes to operands. + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -177,6 +201,7 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -188,9 +213,17 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -198,6 +231,7 @@ static bool IsOpAllowlisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -215,6 +249,11 @@ static bool IsOpAllowlisted(Operation* op) { return ops.count(abstractOp->typeID); } +namespace { + +template +using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok + static std::unique_ptr CreateDeviceMgr( const std::string& device_type) { // Register compilation kernels for all registered XLA backends. @@ -492,12 +531,14 @@ tensorflow::XlaExpression Tf2XlaRewriter::GetExprForOperand(Value operand, class Tf2XlaRewritePattern : public RewritePattern { public: + // Set benefit to 0 (= least benefit) so this pattern is only used as a + // fallback. 
explicit Tf2XlaRewritePattern(const std::string& device_type) - : RewritePattern(1, MatchAnyOpTypeTag()), device_type_(device_type) {} + : RewritePattern(0, MatchAnyOpTypeTag()), device_type_(device_type) {} LogicalResult matchAndRewrite(Operation* op, PatternRewriter& rewriter) const override { - if (!IsOpAllowlisted(op)) return failure(); + if (!IsOpAllowedTf2XlaFallback(op)) return failure(); return Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_); } @@ -527,8 +568,7 @@ class LegalizeTF : public PassWrapper { // global device type for all TensorFlow ops. Option device_type_{ *this, "device-type", - llvm::cl::desc("XLA device type for execution of TensorFlow ops. " - "Supports XLA_CPU_JIT and XLA_TPU_JIT for now.")}; + llvm::cl::desc("XLA device type for execution of TensorFlow ops.")}; }; static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc index 832bad2dcc8..ef362d95b97 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/IR/AffineMap.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -34,6 +35,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassOptions.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" @@ -134,6 +136,11 @@ Status ConvertModule(std::unique_ptr hlo_module, ModuleOp module, // MLIR LHLO. 
class XlaHloToLhloPass : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: XlaHloToLhloPass() = default; XlaHloToLhloPass(const XlaHloToLhloPass&) {} @@ -182,7 +189,10 @@ template StatusOr LhloDialectEmitter::CreateOpWithoutAttrs( HloInstruction* instr) { Location loc = getLocation(instr); - ArrayRef> attrs; + std::pair attrs[] = { + {Identifier::get("name", builder_.getContext()), + builder_.getStringAttr(instr->name())}, + }; ArrayRef rets{}; llvm::SmallVector operands; @@ -252,15 +262,14 @@ Status LhloDialectEmitter::DefaultAction(HloInstruction* instr) { return Status::OK(); } -StatusOr LhloDialectEmitter::EmitSortOp( - HloInstruction* instr) { +StatusOr LhloDialectEmitter::EmitSortOp(HloInstruction* instr) { TF_ASSIGN_OR_RETURN(auto sort, CreateOpWithoutAttrs(instr)); auto* sort_instr = ::xla::Cast<::xla::HloSortInstruction>(instr); sort.dimensionAttr(builder_.getI64IntegerAttr(sort_instr->sort_dimension())); sort.is_stableAttr(builder_.getBoolAttr(sort_instr->is_stable())); TF_RETURN_IF_ERROR(::xla::HloFunctionImporter::ImportAsRegion( *sort_instr->called_computations()[0], &sort.comparator(), &builder_)); - return sort.getOperation(); + return sort; } Status LhloDialectEmitter::HandleSort(HloInstruction* instr) { @@ -327,19 +336,17 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, // create another view to adjust the slice for the shape of the instruction. Status LhloDialectEmitter::GetOrCreateView(const HloInstruction* instr, SmallVectorImpl* values) { - // In terms of cache key, we have several choices: - // * Use `instr`. It's the easiest, but it creates different cache entries for - // aliased buffers, which could have been deduplicated. - // * Use the actual content as the key, aka a tree of allocation slices. - // * Somewhere in the middle, use the allocation slice for the instruction. If - // `instr` is a tuple, the key is the allocated buffer for the tuple itself - // (an array of pointers). + // Cache generated ViewOp and StaticMemRefCastOp by instruction. We could have + // gone fancier and done the following caching: + // %range = ViewOp(%allocation, %offset) : memref + // %typed_range = ViewOp(%range) : memref // - // We choose the third approach for simplicity. - TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice, - assignment_.GetUniqueTopLevelSlice(instr)); - SliceKey slice_key(slice.allocation(), slice.offset(), slice.size()); - auto result = slices_.try_emplace(slice_key, llvm::SmallVector{}); + // where %range is cached. In theory this gives alias analysis an easier + // time, since the identity of %range defines aliasing. However, + // %typed_range can't be cached, as different buffers with different types and + // shapes may still alias. Creating two ViewOps doesn't seem worth the + // effort for slightly easier aliasing, so we don't over-optimize here.
+ auto result = slices_.try_emplace(instr, llvm::SmallVector{}); llvm::SmallVectorImpl& new_values = result.first->second; if (result.second) { ::xla::ShapeIndex shape_index; @@ -439,7 +446,7 @@ Status LhloDialectEmitter::Initialize() { builder_.setInsertionPointToEnd(block); auto return_op = builder_.create(builder_.getUnknownLoc()); - builder_ = mlir::OpBuilder(return_op); + builder_ = OpBuilder(return_op); return Status::OK(); } @@ -450,6 +457,9 @@ std::unique_ptr> createXlaHloToLhloWithXlaPass() { Status HloToLhloModule(const BufferAssignment& assignment, const HloModule& hlo_module, ModuleOp module) { + module.getContext() + ->loadDialect(); HloComputation* computation = hlo_module.entry_computation(); LhloDialectEmitter emitter(assignment, *computation, module); @@ -463,15 +473,14 @@ Status HloToLhloModule(const BufferAssignment& assignment, return computation->AcceptOrdered(&emitter, ordering); } -mlir::OwningModuleRef HloTextToLhloTranslateFunction( - llvm::StringRef input, mlir::MLIRContext* context) { +OwningModuleRef HloTextToLhloTranslateFunction(llvm::StringRef input, + MLIRContext* context) { StatusOr> maybe_module = xla::ParseAndReturnUnverifiedModule( absl::string_view(input.data(), input.size())); TF_CHECK_OK(maybe_module.status()); - mlir::OwningModuleRef module = - mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); + OwningModuleRef module = ModuleOp::create(UnknownLoc::get(context)); TF_CHECK_OK( ConvertModule(maybe_module.ConsumeValueOrDie(), module.get(), "Host")); diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h index bdc977616b1..89514116254 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -41,7 +42,7 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { builder_(module.getContext()), i8_type_(builder_.getIntegerType(8)) {} - ::xla::StatusOr EmitSortOp(::xla::HloInstruction* instr); + ::xla::StatusOr EmitSortOp(::xla::HloInstruction* instr); private: template @@ -86,9 +87,9 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { // (see below). llvm::DenseMap allocations_; - // This map provides access to MLIR buffers for each HLO instruction, keyed by - // its buffer slice. A slice is contained in a BufferAllocation, and has an - // offset and a size. + // This map provides access to MLIR buffers for each HLO instruction, keyed + // instruction identity. A slice is contained in a BufferAllocation, and has + // an offset and a size. // // As for why we don't use HloInstruction*, see GetOrCreateView(), but mostly // we want to leverage better of the aliased buffers. @@ -101,8 +102,8 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { // // `slices_` is populated lazily in the `GetOrCreateView()` helper as we // process every instruction. - using SliceKey = std::tuple; - llvm::DenseMap> slices_; + llvm::DenseMap> + slices_; // The BufferAssignment computed by XLA ahead of time. 
const ::xla::BufferAssignment& assignment_; diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 8850581f0bd..45166941620 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -36,8 +36,13 @@ namespace mhlo { /// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is /// false, emits an error if there is any operation that can't be legalized. +/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. std::unique_ptr> createLegalizeTFPass( - bool allow_partial_conversion = false, bool legalize_chlo = true); + bool allow_partial_conversion = false, bool legalize_chlo = true, + llvm::Optional tf2xla_fallback_device_type = llvm::None); /// Lowers from TF dialect to HLO dialect using tf2xla op kernels for the /// specified device type. @@ -53,6 +58,9 @@ void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, void PopulateLegalizeTfPatterns(MLIRContext* context, OwningRewritePatternList* patterns); +/// Checks whether the op is supported by the Tf2Xla fallback for legalization. +bool IsOpAllowedTf2XlaFallback(Operation* op); + /// Lowers from TF dialect's control flow to HLO dialect's control flow. std::unique_ptr> createLegalizeTFControlFlowPass(); @@ -60,8 +68,14 @@ std::unique_ptr> createLegalizeTFControlFlowPass(); /// dialect using the conversion patterns registered by the HLO dialect. When /// allow_partial_conversion is false, emits an error if there is any operation /// that can't be legalized. -LogicalResult legalizeTF(Operation* op, bool allow_partial_conversion = false, - bool legalize_chlo = true); +/// When `tf2xla_fallback_device_type` is not `None`, also uses legalization +/// patterns from TF2XLA fallback for provided device type (see +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not +/// used. +LogicalResult legalizeTF( + Operation* op, bool allow_partial_conversion = false, + bool legalize_chlo = true, + llvm::Optional tf2xla_fallback_device_type = llvm::None); // Legalizes TF/XLA communication ops (TF dialect) to HLO dialect communication // ops. 
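As a usage illustration of the `tf2xla_fallback_device_type` option declared in passes.h above, a hypothetical pipeline setup might look like the following; the pass manager boilerplate and the "XLA_TPU_JIT" device string are assumptions for the sketch, not taken from this patch:

#include "llvm/ADT/StringRef.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"

// Adds the TF->HLO legalization pass with the TF2XLA fallback enabled for a
// given device type to an existing pass manager.
void AddLegalizeTfWithFallback(mlir::PassManager& pm) {
  pm.addNestedPass<mlir::FuncOp>(mlir::mhlo::createLegalizeTFPass(
      /*allow_partial_conversion=*/false, /*legalize_chlo=*/true,
      /*tf2xla_fallback_device_type=*/llvm::StringRef("XLA_TPU_JIT")));
}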
diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index afc36916348..b725f56b455 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -43,47 +43,41 @@ using xla::ShapeUtil; namespace xla { PrimitiveType TypeToPrimitiveType(mlir::Type type) { - switch (type.getKind()) { - case mlir::StandardTypes::BF16: - return PrimitiveType::BF16; - case mlir::StandardTypes::Complex: { - mlir::Type element_ty = type.cast().getElementType(); - switch (element_ty.getKind()) { - case mlir::StandardTypes::F32: - return PrimitiveType::C64; - case mlir::StandardTypes::F64: - return PrimitiveType::C128; - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; - } + if (type.isBF16()) { + return PrimitiveType::BF16; + } else if (type.isF16()) { + return PrimitiveType::F16; + } else if (type.isF32()) { + return PrimitiveType::F32; + } else if (type.isF64()) { + return PrimitiveType::F64; + } else if (auto complex_type = type.dyn_cast()) { + mlir::Type element_ty = complex_type.getElementType(); + if (element_ty.isF32()) { + return PrimitiveType::C64; + + } else if (element_ty.isF64()) { + return PrimitiveType::C128; } - case mlir::StandardTypes::F16: - return PrimitiveType::F16; - case mlir::StandardTypes::F32: - return PrimitiveType::F32; - case mlir::StandardTypes::F64: - return PrimitiveType::F64; - case mlir::StandardTypes::Integer: { - const auto integer = type.cast(); - bool is_unsigned = integer.isUnsigned(); - switch (integer.getWidth()) { - case 1: - return PrimitiveType::PRED; - case 8: - return is_unsigned ? PrimitiveType::U8 : PrimitiveType::S8; - case 16: - return is_unsigned ? PrimitiveType::U16 : PrimitiveType::S16; - case 32: - return is_unsigned ? PrimitiveType::U32 : PrimitiveType::S32; - case 64: - return is_unsigned ? PrimitiveType::U64 : PrimitiveType::S64; - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; - } + return PrimitiveType::PRIMITIVE_TYPE_INVALID; + } else if (auto integer_type = type.dyn_cast()) { + bool is_unsigned = integer_type.isUnsigned(); + switch (integer_type.getWidth()) { + case 1: + return PrimitiveType::PRED; + case 8: + return is_unsigned ? PrimitiveType::U8 : PrimitiveType::S8; + case 16: + return is_unsigned ? PrimitiveType::U16 : PrimitiveType::S16; + case 32: + return is_unsigned ? PrimitiveType::U32 : PrimitiveType::S32; + case 64: + return is_unsigned ? 
PrimitiveType::U64 : PrimitiveType::S64; + default: + return PrimitiveType::PRIMITIVE_TYPE_INVALID; } - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; } + return PrimitiveType::PRIMITIVE_TYPE_INVALID; } StatusOr TypeToShape( @@ -108,108 +102,89 @@ Shape TypeToShape(mlir::Type type) { if (ptype != PrimitiveType::PRIMITIVE_TYPE_INVALID) return ShapeUtil::MakeShape(ptype, {}); - switch (type.getKind()) { - case mlir::StandardTypes::BF16: - case mlir::StandardTypes::F32: - case mlir::StandardTypes::F64: - case mlir::StandardTypes::Integer: { - auto* context = type.getContext(); - mlir::emitError(mlir::UnknownLoc::get(context)) - << "lowering should have been handled by primitive type lowering for " - << debugString(type); - break; + if (type.isBF16() || type.isF32() || type.isF64() || + type.isa()) { + auto* context = type.getContext(); + mlir::emitError(mlir::UnknownLoc::get(context)) + << "lowering should have been handled by primitive type lowering for " + << debugString(type); + } else if (auto v = type.dyn_cast()) { + llvm::SmallVector span(v.getShape().begin(), v.getShape().end()); + mlir::Type element_type = v.getElementType(); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) + return ShapeUtil::MakeShape(primitive_type, span); + } else if (auto m = type.dyn_cast()) { + llvm::SmallVector span(m.getShape().begin(), m.getShape().end()); + mlir::Type element_type = m.getElementType(); + // Treat a memref of a vector as if it was a memref of primitive type with + // the vector dimensions at the end. + if (auto v = element_type.dyn_cast()) { + element_type = v.getElementType(); + span.insert(span.end(), v.getShape().begin(), v.getShape().end()); } - case mlir::StandardTypes::Vector: { - const auto v = type.cast(); - llvm::SmallVector span(v.getShape().begin(), - v.getShape().end()); - mlir::Type element_type = v.getElementType(); - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) - return ShapeUtil::MakeShape(primitive_type, span); - break; - } - case mlir::StandardTypes::MemRef: { - const auto m = type.cast(); - llvm::SmallVector span(m.getShape().begin(), - m.getShape().end()); - mlir::Type element_type = m.getElementType(); - // Treat a memref of a vector as if it was a memref of primitive type with - // the vector dimensions at the end. - if (auto v = element_type.dyn_cast()) { - element_type = v.getElementType(); - span.insert(span.end(), v.getShape().begin(), v.getShape().end()); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + if (primitive_type == PrimitiveType::PRIMITIVE_TYPE_INVALID) return {}; + // For the primitive type case, the shape of the memref is similar to the + // vector type case (i.e., it is, modulo the layout, the same dimensions + // and primitive type). 
+ if (m.getAffineMaps().empty()) + return ShapeUtil::MakeShape(primitive_type, span); + + if (m.getAffineMaps().size() == 1) { + llvm::SmallVector strides; + int64_t offset; + if (failed(mlir::getStridesAndOffset(m, strides, offset))) return {}; + + llvm::SmallVector, 4> strides_with_indices; + for (const auto& e : llvm::enumerate(strides)) { + strides_with_indices.push_back({e.value(), e.index()}); } - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - if (primitive_type == PrimitiveType::PRIMITIVE_TYPE_INVALID) break; - // For the primitive type case, the shape of the memref is similar to the - // vector type case (i.e., it is, modulo the layout, the same dimensions - // and primitive type). - if (m.getAffineMaps().empty()) - return ShapeUtil::MakeShape(primitive_type, span); + std::sort(strides_with_indices.begin(), strides_with_indices.end()); - if (m.getAffineMaps().size() == 1) { - llvm::SmallVector strides; - int64_t offset; - if (failed(mlir::getStridesAndOffset(m, strides, offset))) return {}; + llvm::SmallVector minor_to_major; + int64_t stride = 1; + for (const auto& pr : strides_with_indices) { + minor_to_major.push_back(pr.second); - llvm::SmallVector, 4> strides_with_indices; - for (const auto& e : llvm::enumerate(strides)) { - strides_with_indices.push_back({e.value(), e.index()}); - } - std::sort(strides_with_indices.begin(), strides_with_indices.end()); + // Either the affine map is not perfectly strided, or the dimensions + // recovered from strides don't match the actual dimensions in shapes. + if (stride != pr.first) return {}; - llvm::SmallVector minor_to_major; - int64_t stride = 1; - for (const auto& pr : strides_with_indices) { - minor_to_major.push_back(pr.second); - - // Either the affine map is not perfectly strided, or the dimensions - // recovered from strides don't match the actual dimensions in shapes. - if (stride != pr.first) return {}; - - stride *= m.getShape()[pr.second]; - } - - llvm::SmallVector dimensions(m.getShape().begin(), - m.getShape().end()); - return ::xla::ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, - minor_to_major); + stride *= m.getShape()[pr.second]; } - break; + + llvm::SmallVector dimensions(m.getShape().begin(), + m.getShape().end()); + return ::xla::ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, + minor_to_major); } - case mlir::StandardTypes::RankedTensor: { - // TODO(jpienaar): This is only handling the base case with primitive - // element type. - const auto t = type.cast(); - llvm::SmallVector span(t.getShape().begin(), - t.getShape().end()); - // Only fully static shapes are supported. - // TODO(b/115638799): Update once xla::Shape can support dynamic shapes. - if (std::find(t.getShape().begin(), t.getShape().end(), -1) != - t.getShape().end()) - break; - mlir::Type element_type = t.getElementType(); - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - // Only primitive element type supported. - if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) - return ShapeUtil::MakeShape(primitive_type, span); - break; + } else if (auto t = type.dyn_cast()) { + // TODO(jpienaar): This is only handling the base case with primitive + // element type. + llvm::SmallVector span(t.getShape().begin(), t.getShape().end()); + // Only fully static shapes are supported. + // TODO(b/115638799): Update once xla::Shape can support dynamic shapes. 
+ if (std::find(t.getShape().begin(), t.getShape().end(), -1) != + t.getShape().end()) + return {}; + mlir::Type element_type = t.getElementType(); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + // Only primitive element type supported. + if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) + return ShapeUtil::MakeShape(primitive_type, span); + } else if (auto tuple_type = type.dyn_cast()) { + llvm::SmallVector shapes; + shapes.reserve(tuple_type.size()); + for (mlir::Type sub_type : tuple_type.getTypes()) { + shapes.push_back(TypeToShape(sub_type)); } - case mlir::StandardTypes::Tuple: { - const auto t = type.cast(); - llvm::SmallVector shapes; - shapes.reserve(t.size()); - for (mlir::Type sub_type : t.getTypes()) { - shapes.push_back(TypeToShape(sub_type)); - } - return ShapeUtil::MakeTupleShape(shapes); - } - case mlir::mhlo::HLOTypes::Token: - return ShapeUtil::MakeTokenShape(); - default: - break; + return ShapeUtil::MakeTupleShape(shapes); + + } else if (type.isa()) { + return ShapeUtil::MakeTokenShape(); } + // Return empty XLA shape to signify error. No MLIR Type maps to a empty // Shape. return {}; diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index 158671a6242..4ad44d1bd77 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -17,11 +17,15 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/hlo_to_mlir_hlo.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" #include "tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h" +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -30,19 +34,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/protobuf.h" -// NOLINTNEXTLINE -static llvm::cl::opt emit_use_tuple_arg( - "emit-use-tuple-args", - llvm::cl::desc( - "Emit HLO modules using tuples as args for the entry computation"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt emit_return_tuple( - "emit-return-tuple", - llvm::cl::desc("Emit HLO modules with entry computations returning tuple"), - llvm::cl::init(false)); - namespace xla { namespace { @@ -173,11 +164,17 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunction( } // namespace xla +static void RegisterInputDialects(mlir::DialectRegistry& registry) { + registry.insert(); +} + static mlir::TranslateFromMLIRRegistration MlirHloToHloTranslate( - "mlir-hlo-to-hlo", xla::MlirHloToHloTranslateFunction); + "mlir-hlo-to-hlo", xla::MlirHloToHloTranslateFunction, + RegisterInputDialects); static mlir::TranslateFromMLIRRegistration MlirHloToHloTextTranslate( - "mlir-hlo-to-hlo-text", xla::MlirHloToHloTextTranslateFunction); + "mlir-hlo-to-hlo-text", xla::MlirHloToHloTextTranslateFunction, + RegisterInputDialects); static mlir::TranslateToMLIRRegistration HloToHloMlirTranslate( "hlo-to-mlir-hlo", xla::HloToMlirHloTranslateFunction); diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc new file mode 100644 index 00000000000..bfe4ed3844f --- /dev/null +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" + +// NOLINTNEXTLINE +llvm::cl::opt emit_use_tuple_arg( + "emit-use-tuple-args", + llvm::cl::desc( + "Emit HLO modules using tuples as args for the entry computation"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +llvm::cl::opt emit_return_tuple( + "emit-return-tuple", + llvm::cl::desc("Emit HLO modules with entry computations returning tuple"), + llvm::cl::init(false)); diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h new file mode 100644 index 00000000000..1d5a29a5fdb --- /dev/null +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ + +#include "llvm/Support/CommandLine.h" + +// This file contains command-line options aimed to provide the parameters +// required by the MLIR module to XLA HLO conversion. It is only intended to be +// included by binaries. + +extern llvm::cl::opt emit_use_tuple_arg; +extern llvm::cl::opt emit_return_tuple; + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index a3134fc1c94..30b8a7e5561 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -265,6 +265,7 @@ tf_xla_py_test( name = "categorical_op_test", size = "small", srcs = ["categorical_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -283,6 +284,7 @@ tf_xla_py_test( name = "cholesky_op_test", size = "medium", srcs = ["cholesky_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -347,6 +349,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["searchsorted_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -389,6 +392,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_inverse_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -411,6 +415,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_solve_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -429,6 +434,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_triangular_solve_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -469,7 +475,6 @@ tf_xla_py_test( enable_mlir_bridge = True, python_version = "PY3", tags = [ - "many_xla_args", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", ], @@ -533,6 +538,7 @@ tf_xla_py_test( name = "depthwise_conv_op_test", size = "medium", srcs = ["depthwise_conv_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -632,6 +638,7 @@ tf_xla_py_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -688,6 +695,7 @@ tf_xla_py_test( name = "fft_test", size = "medium", srcs = ["fft_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 6, tags = [ @@ -783,6 +791,7 @@ tf_xla_py_test( name = "listdiff_op_test", size = "small", srcs = ["listdiff_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -821,6 +830,7 @@ tf_xla_py_test( name = "manip_ops_test", size = "small", srcs = ["manip_ops_test.py"], + 
enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -858,6 +868,7 @@ tf_xla_py_test( size = "medium", timeout = "long", srcs = ["matrix_diag_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -927,6 +938,7 @@ tf_xla_py_test( name = "pooling_ops_test", size = "medium", srcs = ["pooling_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 20, tags = [ @@ -1005,6 +1017,7 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -1031,6 +1044,7 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -1113,6 +1127,7 @@ tf_xla_py_test( name = "reverse_ops_test", size = "medium", srcs = ["reverse_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1164,6 +1179,7 @@ tf_xla_py_test( name = "scan_ops_test", size = "medium", srcs = ["scan_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1182,6 +1198,7 @@ tf_xla_py_test( name = "segment_reduction_ops_test", size = "medium", srcs = ["segment_reduction_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1204,6 +1221,7 @@ tf_xla_py_test( name = "spacetobatch_op_test", size = "medium", srcs = ["spacetobatch_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 3, tags = [ @@ -1279,6 +1297,7 @@ tf_xla_py_test( name = "stateless_random_ops_test", size = "medium", srcs = ["stateless_random_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1562,6 +1581,7 @@ tf_xla_py_test( name = "xla_device_test", size = "small", srcs = ["xla_device_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1685,6 +1705,7 @@ tf_cuda_cc_test( deps = [ "//tensorflow/cc:cc_ops", "//tensorflow/compiler/jit", + "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:xla_kernel_creator", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:core_cpu", @@ -1883,6 +1904,7 @@ tf_xla_py_test( name = "special_math_test", size = "medium", srcs = ["special_math_test.py"], + enable_mlir_bridge = True, shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py index 4bd2dfd9244..41877d39381 100644 --- a/tensorflow/compiler/tests/cholesky_op_test.py +++ b/tensorflow/compiler/tests/cholesky_op_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -61,7 +60,7 @@ class CholeskyOpTest(xla_test.XLATestCase): dtypes.as_dtype(x.dtype), shape=x.shape) with 
self.test_scope(): chol = linalg_ops.cholesky(placeholder) - verification = math_ops.matmul(chol, chol, adjoint_b=True) + verification = test_util.matmul_without_tf32(chol, chol, adjoint_b=True) self._verifyCholeskyBase(sess, placeholder, x, chol, verification, atol) def testBasic(self): diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py index 0202c582ef3..08aad66abe1 100644 --- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -29,7 +29,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -65,7 +64,8 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): with self.test_scope(): x = linalg_ops.matrix_triangular_solve( placeholder_a, placeholder_b, lower=lower, adjoint=adjoint) - verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint) + verification = test_util.matmul_without_tf32( + placeholder_ca, x, adjoint_a=adjoint) self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca, placeholder_b, a, clean_a, b, verification, atol) @@ -135,6 +135,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): self._VerifyTriangularSolve( a.astype(np.float32), b.astype(np.float32), True, False, 1e-4) + @test_util.disable_mlir_bridge("Error handling") def testNonSquareCoefficientMatrix(self): rng = np.random.RandomState(0) for dtype in self.float_types: @@ -145,6 +146,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): linalg_ops.matrix_triangular_solve(a, b) @test_util.run_v2_only # Different error types + @test_util.disable_mlir_bridge("Error handling") def testWrongDimensionsV2(self): randn = np.random.RandomState(0).randn for dtype in self.float_types: @@ -156,6 +158,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): linalg_ops.matrix_triangular_solve(lhs, rhs) @test_util.run_v1_only("Different error types") + @test_util.disable_mlir_bridge("Error handling") def testWrongDimensionsV1(self): randn = np.random.RandomState(0).randn for dtype in self.float_types: diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py index 5fcf254db82..f396e61f3d1 100644 --- a/tensorflow/compiler/tests/qr_op_test.py +++ b/tensorflow/compiler/tests/qr_op_test.py @@ -24,12 +24,18 @@ from absl.testing import parameterized import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +@test_util.run_all_without_tensor_float_32( + "XLA QR op calls matmul. Also, matmul used for verification. 
Also with " + 'TensorFloat-32, mysterious "Unable to launch cuBLAS gemm" error ' + "occasionally occurs") +# TODO(b/165435566): Fix "Unable to launch cuBLAS gemm" error class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): def AdjustedNorm(self, x): @@ -73,7 +79,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): with self.session() as sess: x_tf = array_ops.placeholder(dtype) - with self.test_scope(): + with self.device_scope(): q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices) q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 9f963110cf3..0f19affc8e3 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -63,9 +63,9 @@ limitations under the License. #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py index 7c36f8b13ca..440b7672d98 100644 --- a/tensorflow/compiler/tests/scan_ops_test.py +++ b/tensorflow/compiler/tests/scan_ops_test.py @@ -24,6 +24,7 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -129,6 +130,7 @@ class CumsumTest(xla_test.XLATestCase): for axis in range(-6, 6, 3): self._compareAll(x, axis) + @test_util.disable_mlir_bridge("Error handling") def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) with self.session(), self.test_scope(): @@ -207,6 +209,7 @@ class CumprodTest(xla_test.XLATestCase): for axis in range(-6, 6, 3): self._compareAll(x, axis) + @test_util.disable_mlir_bridge("Error handling") def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) with self.session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/stateful_random_ops_test.py b/tensorflow/compiler/tests/stateful_random_ops_test.py index 343969c40d7..239b99de19e 100644 --- a/tensorflow/compiler/tests/stateful_random_ops_test.py +++ b/tensorflow/compiler/tests/stateful_random_ops_test.py @@ -25,7 +25,9 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.client import device_lib +from tensorflow.python.compat import compat from tensorflow.python.eager import def_function +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops @@ -156,6 +158,10 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): def testNewStateThreeFry(self): """Tests that the new state is correct (for ThreeFry). 
""" + if compat.forward_compatible(2020, 10, 25): + self.skipTest("The expected values in this test is inconsistent with " + "CPU/GPU. testXLAEqualsCPU has the correct checks of the " + "new states for the new version.") with ops.device(xla_device_name()): counter = 57 key = 0x1234 @@ -171,6 +177,10 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): def testNewStatePhilox(self): """Tests that the new state is correct (for Philox). """ + if compat.forward_compatible(2020, 10, 25): + self.skipTest("The expected values in this test is inconsistent with " + "CPU/GPU. testXLAEqualsCPU has the correct checks of the " + "new states for the new version.") with ops.device(xla_device_name()): counter_low = 57 counter_high = 283 @@ -204,13 +214,39 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): """Tests that XLA and CPU kernels generate the same integers.""" seed = 1234 shape = [315, 49] - with ops.device("/device:CPU:0"): - cpu = (random.Generator.from_seed(seed=seed, alg=random.RNG_ALG_PHILOX) - .uniform_full_int(shape=shape, dtype=dtype)) - with ops.device(xla_device_name()): - xla = (random.Generator.from_seed(seed=seed, alg=random.RNG_ALG_PHILOX) - .uniform_full_int(shape=shape, dtype=dtype)) - self.assertAllEqual(cpu, xla) + if compat.forward_compatible(2020, 10, 25): + with ops.device("/device:CPU:0"): + cpu_gen = random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX) + with ops.device(xla_device_name()): + xla_gen = random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX) + # Repeat multiple times to make sure that the state after + # number-generation are the same between CPU and XLA. + for _ in range(5): + with ops.device("/device:CPU:0"): + # Test both number-generation and skip + cpu = cpu_gen.uniform_full_int(shape=shape, dtype=dtype) + cpu_gen.skip(100) + with ops.device(xla_device_name()): + xla = xla_gen.uniform_full_int(shape=shape, dtype=dtype) + xla_gen.skip(100) + self.assertAllEqual(cpu, xla) + self.assertAllEqual(cpu_gen.state, xla_gen.state) + else: + # The old version doesn't guarantee that CPU and XLA are in the same state + # after number-generation, which is a bug. + with ops.device("/device:CPU:0"): + cpu = ( + random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX).uniform_full_int( + shape=shape, dtype=dtype)) + with ops.device(xla_device_name()): + xla = ( + random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX).uniform_full_int( + shape=shape, dtype=dtype)) + self.assertAllEqual(cpu, xla) def _testRngIsNotConstant(self, rng, dtype): # Tests that 'rng' does not always return the same value. 
@@ -364,4 +400,5 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): if __name__ == "__main__": ops.enable_eager_execution() + config.set_soft_device_placement(False) test.main() diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index f9d792806b0..23e827f18e8 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -21,7 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.python.compiler.xla import xla +from tensorflow.python.eager import def_function +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.kernel_tests.random import util as \ random_test_util from tensorflow.python.ops import array_ops @@ -39,6 +43,26 @@ class StatelessRandomOpsTest(xla_test.XLATestCase): allowed_types.update({dtypes.int32, dtypes.int64}) return self.all_tf_types & allowed_types + @test_util.run_v2_only + def testForcedCompile(self): + """Tests whole-function forced-compilation. + + This test checks that stateless_random_* can be used in forced-compilation + scenarios (e.g. TPU). The new version of stateless_random_* requires the + intermediate tensor `alg` to be compile-time constant, so we need to check + that this requirement is met. We use xla.compile instead of tf.function's + experimental_compile because the latter doesn't throw an error even if the + compile-time-constant constraint is not met. + """ + if config.list_logical_devices('TPU'): + self.skipTest('To accommodate OSS, xla.compile support for TPU is not ' + 'linked in.') + @def_function.function + def f(x): + return xla.compile( + lambda x: stateless.stateless_random_normal([], seed=x), [x]) + f([1, 2]) + def testDeterminism(self): # Stateless values should be equal iff the seeds are equal (roughly) with self.session(), self.test_scope(): @@ -138,7 +162,7 @@ class StatelessRandomOpsBenchmark(test.Benchmark): def _benchmarkUniform(self, name, dtype, use_xla_jit): - def BuilderFn(): + def builder_fn(): shape = (10, 1000, 1000) seed_var = variables.Variable((312, 456), dtype=dtypes.int32, @@ -147,7 +171,7 @@ class StatelessRandomOpsBenchmark(test.Benchmark): shape, seed=seed_var, dtype=dtype) return '%s.shape%s' % (name, shape), [random_t] - xla_test.Benchmark(self, BuilderFn, use_xla_jit=use_xla_jit, device='cpu') + xla_test.Benchmark(self, builder_fn, use_xla_jit=use_xla_jit, device='cpu') def benchmarkUniformF32(self): self._benchmarkUniform( @@ -167,4 +191,5 @@ class StatelessRandomOpsBenchmark(test.Benchmark): if __name__ == '__main__': + config.set_soft_device_placement(False) test.main() diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 7bbfecff403..4109fdc64a5 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -214,7 +214,6 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): upper, expected=np.minimum(np.maximum(x, lower), upper)) - @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetaincSanity(self): # This operation is only supported for float32 and float64. 
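testForcedCompile above exists because the new stateless_random_* lowering needs the intermediate alg tensor to be a compile-time constant, and, as its docstring notes, xla.compile reports a violated constant constraint while tf.function's experimental_compile does not. A stripped-down sketch of the same check, reusing the internal xla.compile entry point the test itself imports (not public API, so treat the import path as an assumption carried over from the test):

import tensorflow as tf
from tensorflow.python.compiler.xla import xla  # internal entry point, as used in the test above

@tf.function
def forced(seed):
    # xla.compile forces whole-computation XLA compilation, so a
    # "must be a compile-time constant" failure surfaces as an error here.
    return xla.compile(lambda s: tf.random.stateless_normal([], seed=s), [seed])

forced(tf.constant([1, 2]))

The config.set_soft_device_placement(False) lines added to the test mains serve a related purpose: with soft placement off, an op that cannot run on the XLA device fails loudly instead of silently falling back to CPU.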
for dtype in self.numeric_types & {np.float32, np.float64}: @@ -252,7 +251,6 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): 'atol': 2e-4 }, ) - @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetainc(self, sigma, rtol, atol): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc index 569261de094..0e40c497c24 100644 --- a/tensorflow/compiler/tests/unary_ops_composition_test.cc +++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "absl/synchronization/notification.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -43,6 +44,11 @@ limitations under the License. namespace tensorflow { namespace { +static bool Initialized = [] { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + return true; +}(); + class UnaryOpsCompositionTest : public OpsTestBase { protected: template diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index eb022da6895..b5f82bcff12 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -96,7 +96,7 @@ class UnaryOpsTest(xla_test.XLATestCase): self.assertAllEqual(result, expected) @test_util.disable_mlir_bridge( - "MlirHloBuilder::Iota missing required for xla::Diag") + "Handle complex element type in DiagPart lowering") def testAllTypeOps(self): for dtype in self.numeric_types - {np.int8, np.uint8}: self._assertOpOutputMatchesExpected( @@ -538,8 +538,6 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-40, 40], dtype=dtype), expected=np.array([1.0, 0.025], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.QuantizeAndDequantize compilation") def testQuantizeAndDequantize(self): for dtype in self.float_types: @@ -1070,8 +1068,6 @@ class UnaryOpsTest(xla_test.XLATestCase): ], equality_test=self.ListsAreClose) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.DepthToSpace compilation") def testDepthToSpace(self): def make_op(data_format): @@ -1118,14 +1114,12 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( make_op("NCHW_VECT_C"), np.arange(32, dtype=dtype).reshape((1, 8, 1, 1, 4)), - expected=np.array([[[[[0, 1], [8, 9]], [[16, 17], [24, 25]]], - [[[2, 3], [10, 11]], [[18, 19], [26, 27]]], - [[[4, 5], [12, 13]], [[20, 21], [28, 29]]], - [[[6, 7], [14, 15]], [[22, 23], [30, 31]]]]], + expected=np.array([[[[[0, 1, 2, 3], [8, 9, 10, 11]], + [[16, 17, 18, 19], [24, 25, 26, 27]]], + [[[4, 5, 6, 7], [12, 13, 14, 15]], + [[20, 21, 22, 23], [28, 29, 30, 31]]]]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.SpaceToDepth compilation") def testSpaceToDepth(self): def make_op(data_format): @@ -1172,11 +1166,11 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( make_op("NCHW_VECT_C"), np.arange(32, dtype=dtype).reshape((1, 2, 2, 2, 4)), - expected=np.array([[[[[0, 1, 2, 3, 16, 17, 18, 19]]], - [[[4, 5, 6, 7, 20, 21, 22, 23]]], - [[[8, 9, 10, 11, 24, 25, 26, 27]]], - [[[12, 13, 14, 15, 28, 29, 30, 31]]]]], - dtype=dtype)) + expected=np.array( + 
[[[[[0, 1, 2, 3]]], [[[16, 17, 18, 19]]], [[[4, 5, 6, 7]]], + [[[20, 21, 22, 23]]], [[[8, 9, 10, 11]]], [[[24, 25, 26, 27]]], + [[[12, 13, 14, 15]]], [[[28, 29, 30, 31]]]]], + dtype=dtype)) def _assertSoftplusMatchesExpected(self, features, diff --git a/tensorflow/compiler/tests/xla_device_gpu_test.py b/tensorflow/compiler/tests/xla_device_gpu_test.py index 1e30ebd55d0..304405c82ce 100644 --- a/tensorflow/compiler/tests/xla_device_gpu_test.py +++ b/tensorflow/compiler/tests/xla_device_gpu_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.client import session as session_lib +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -27,6 +28,10 @@ from tensorflow.python.platform import test class XlaDeviceGpuTest(test.TestCase): + def __init__(self, method_name="runTest"): + super(XlaDeviceGpuTest, self).__init__(method_name) + context.context().enable_xla_devices() + def testCopiesToAndFromGpuWork(self): """Tests that copies between GPU and XLA devices work.""" if not test.is_gpu_available(): diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 0d6ae81ef6e..3e9f5e8c5dd 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -79,6 +79,25 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(v,), expected=np.tile(v, (7, 42, 1, 1))) + @test_util.disable_mlir_bridge('Not supported yet') + def testGather(self): + operand = np.arange(10, dtype=np.int32).reshape([2, 5]) + start_indices = np.array([2], np.int32) + slice_sizes = np.array([1, 3], np.int32) + + def gather(operand, start_indices): + dimension_numbers = xla_data_pb2.GatherDimensionNumbers() + dimension_numbers.offset_dims.extend([1]) + dimension_numbers.collapsed_slice_dims.extend([0]) + dimension_numbers.start_index_map.extend([0]) + dimension_numbers.index_vector_dim = 1 + return xla.gather(operand, start_indices, dimension_numbers, slice_sizes) + + self._assertOpOutputMatchesExpected( + gather, + args=(operand, start_indices), + expected=np.array([[5, 6, 7]])) + @test_util.disable_mlir_bridge('Dynamic result types not supported') def testShiftRightLogical(self): self._assertOpOutputMatchesExpected( diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index 3b057ed8b17..de97c6ff210 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -83,6 +83,8 @@ class XLATestCase(test.TestCase): def __init__(self, method_name='runTest'): super(XLATestCase, self).__init__(method_name) + if 'XLA' in FLAGS.test_device: + context.context().enable_xla_devices() context.context().enable_mlir_bridge = test_util.is_mlir_bridge_enabled() self.device = FLAGS.test_device @@ -235,8 +237,8 @@ class XLATestCase(test.TestCase): 'test_session not supported on XLATestCase, please use session') @contextlib.contextmanager - def test_scope(self): - """Test scope that runs tests on `self.device`. + def device_scope(self): + """Scope that runs tests on `self.device`. Yields: A scope to apply to the operators under test. @@ -244,6 +246,15 @@ class XLATestCase(test.TestCase): with ops.device('device:{}:0'.format(self.device)): yield + def test_scope(self): + """Deprecated alias of `device_scope`. 
+ + This should be avoided as the name starts with `test`, so test runners + treat it as a test. This interferes with class decorators that operate on + each test method. + """ + return self.device_scope() + def Benchmark(tf_bench, builder_fn, diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 0718bd8cd65..44fb5513886 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -11,7 +11,6 @@ load( "tf_custom_op_library_additional_deps", "tf_gen_op_libs", "tf_gen_op_wrapper_py", - "tf_gpu_kernel_library", ) # buildifier: disable=same-origin-load @@ -81,6 +80,7 @@ tf_cuda_cc_test( cc_library( name = "common_utils", + srcs = ["common/utils.cc"], hdrs = ["common/utils.h"], copts = tf_copts(), deps = [ @@ -539,20 +539,6 @@ tf_cuda_cc_test( ], ) -tf_gpu_kernel_library( - name = "plugin_cast", - srcs = ["plugin/plugin_cast.cu.cc"], - deps = [ - ":trt_plugins", - "@com_google_absl//absl/strings", - "//tensorflow/core/platform:logging", - "//tensorflow/core:framework_lite", - ] + if_tensorrt([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_tensorrt//:tensorrt", - ]), -) - tf_cuda_library( name = "trt_plugins", srcs = ["plugin/trt_plugin.cc"], @@ -602,6 +588,7 @@ pybind_extension( link_in_framework = True, module_name = "_pywrap_py_utils", deps = [ + ":common_utils", ":py_utils", "//tensorflow/core/platform:env", "//tensorflow/core/platform:logging", diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc new file mode 100644 index 00000000000..6679ca04513 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -0,0 +1,99 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
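The device_scope rename above is motivated by test discovery: any method whose name starts with "test" is collected and invoked by the runner, and class decorators such as run_all_without_tensor_float_32 wrap every test* method, so a helper named test_scope gets caught by both. A self-contained illustration with plain unittest:

import unittest

class Demo(unittest.TestCase):

    def test_scope(self):      # starts with "test": the default loader collects it
        print("ran as a test, although it was meant to be a helper")

    def device_scope(self):    # does not match the test* prefix: never collected
        print("only runs when called explicitly")

if __name__ == "__main__":
    unittest.main(verbosity=2)  # executes test_scope, skips device_scope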
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/base/call_once.h" +#include "absl/strings/str_join.h" +#include "third_party/tensorrt/NvInferPlugin.h" +#endif + +namespace tensorflow { +namespace tensorrt { + +std::tuple GetLinkedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetLoadedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + int ver = getInferLibVersion(); + int major = ver / 1000; + ver = ver - major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +namespace tensorflow { +namespace tensorrt { +namespace { + +void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + LOG(INFO) << "Linked TensorRT version: " + << absl::StrJoin(GetLinkedTensorRTVersion(), "."); + LOG(INFO) << "Loaded TensorRT version: " + << absl::StrJoin(GetLoadedTensorRTVersion(), "."); + + bool plugin_initialized = initLibNvInferPlugins(trt_logger, ""); + if (!plugin_initialized) { + LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " + "fail later."; + } + + int num_trt_plugins = 0; + nvinfer1::IPluginCreator* const* trt_plugin_creator_list = + getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); + if (!trt_plugin_creator_list) { + LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; + } else { + VLOG(1) << "Found the following " << num_trt_plugins + << " TensorRT plugins in registry:"; + for (int i = 0; i < num_trt_plugins; ++i) { + if (!trt_plugin_creator_list[i]) { + LOG_WARNING_WITH_PREFIX + << "TensorRT plugin at index " << i + << " is not accessible (null pointer returned by " + "getPluginCreatorList for this plugin)"; + } else { + VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); + } + } + } +} + +} // namespace + +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + static absl::once_flag once; + absl::call_once(once, InitializeTrtPlugins, trt_logger); +} + +} // namespace tensorrt +} // namespace tensorflow +#endif diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h index b428733ecd4..b76b75de783 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.h +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -16,15 +16,33 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#include + +namespace tensorflow { +namespace tensorrt { +// Returns the compile time TensorRT library version information +// {Maj, Min, Patch}. +std::tuple GetLinkedTensorRTVersion(); + +// Returns the runtime time TensorRT library version information +// {Maj, Min, Patch}. +std::tuple GetLoadedTensorRTVersion(); +} // namespace tensorrt +} // namespace tensorflow + #if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/platform/logging.h" +#include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { #define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " +// Initializes the TensorRT plugin registry if this hasn't been done yet. 
+void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger); + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index c4fc3e4f5da..2804a381e0c 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -733,6 +733,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { } segment_options.minimum_segment_size = params.minimum_segment_size; segment_options.use_implicit_batch = params.use_implicit_batch; + if (segment_options.use_implicit_batch) + segment_options.maximum_batch_size = params.max_batch_size; segment_options.allow_dynamic_non_batch_dim = AllowDynamicNonBatchDimension(params); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 3b0553426c0..be3bb51dbed 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -151,7 +151,8 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { class ConvertAfterShapesTest : public ::testing::Test { public: - Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def) { + Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def, + int maximum_batch_size = 1000) { // Create GraphProperties. grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); @@ -162,6 +163,7 @@ class ConvertAfterShapesTest : public ::testing::Test { const std::vector output_names{"output"}; ConversionParams params; params.output_names = &output_names; + params.max_batch_size = maximum_batch_size; params.max_workspace_size_bytes = 8 << 20; params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index dc5acbb4f50..c0c3f25177e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1197,42 +1197,6 @@ Status TrtNodeValidator::ConvertConstToWeights( return status; } -static void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { - static mutex plugin_mutex(LINKER_INITIALIZED); - static bool plugin_initialized = false; - mutex_lock lock(plugin_mutex); - if (plugin_initialized) return; - - LOG(INFO) << "Linked TensorRT version: " << GetLinkedTensorRTVersion(); - LOG(INFO) << "Loaded TensorRT version: " << GetLoadedTensorRTVersion(); - - plugin_initialized = initLibNvInferPlugins(trt_logger, ""); - if (!plugin_initialized) { - LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " - "fail later."; - } - - int num_trt_plugins = 0; - nvinfer1::IPluginCreator* const* trt_plugin_creator_list = - getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); - if (!trt_plugin_creator_list) { - LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; - } else { - VLOG(1) << "Found the following " << num_trt_plugins - << " TensorRT plugins in registry:"; - for (int i = 0; i < num_trt_plugins; ++i) { - if (!trt_plugin_creator_list[i]) { - LOG_WARNING_WITH_PREFIX - << "TensorRT plugin at index " << i - << " is not accessible (null pointer returned by " - "getPluginCreatorList for this plugin)"; - } else { - VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); - } - } - } -} - // static StatusOr> 
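The new common/utils.{h,cc} pair above centralizes two things that convert_nodes.cc used to own: the TensorRT version helpers, which now return a {major, minor, patch} tuple instead of a pre-formatted string (call sites switch to absl::StrJoin(..., ".")), and plugin registration, which is now guarded by absl::call_once instead of a hand-rolled mutex-plus-bool. The loaded version comes from getInferLibVersion(), which packs the three numbers as major*1000 + minor*100 + patch; a small sketch of that decode, written in Python only to mirror the arithmetic of the C++ helper:

def decode_trt_version(packed):
    """Splits getInferLibVersion()'s packed integer into (major, minor, patch)."""
    major, rest = divmod(packed, 1000)
    minor, patch = divmod(rest, 100)
    return major, minor, patch

assert decode_trt_version(7103) == (7, 1, 3)   # e.g. a TensorRT 7.1.3 runtime
assert decode_trt_version(6015) == (6, 0, 15)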
Converter::Create( TrtPrecisionMode precision_mode, bool use_calibration, @@ -1249,7 +1213,7 @@ Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration, : precision_mode_(precision_mode), use_calibration_(use_calibration), use_implicit_batch_(use_implicit_batch) { - InitializeTrtPlugins(trt_logger); + MaybeInitializeTrtPlugins(trt_logger); this->RegisterOpConverters(); } @@ -1434,7 +1398,8 @@ Status Converter::BuildCudaEngine( TF_RETURN_IF_ERROR( TrtPrecisionModeToName(precision_mode_, &precision_mode_str)); string trt_network_name = StrCat( - "TF:", TF_VERSION_STRING, ", ", "TRT:", GetLoadedTensorRTVersion(), "-", + "TF:", TF_VERSION_STRING, ", ", + "TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-", "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_, ", ", "Max-Batch-Size:", max_batch_size, ", ", "Max-Workspace-Size:", max_workspace_size_bytes); @@ -2410,6 +2375,40 @@ Status ConvertTranspose(OpConverterParams* params) { return Status::OK(); } +Status ConvertShape(OpConverterParams* params) { + const auto& inputs = params->inputs; + TF_RETURN_IF_ERROR( + CheckInputsWeights(*params, {{"input", TrtInputArg::kBoth}})); + if (params->use_implicit_batch) { + return errors::Unimplemented( + "Shape is only supported for explicit batch mode."); + } + if (HasStaticShape(inputs.at(0).GetTrtDims())) { + if (params->validation_only) return Status::OK(); + nvinfer1::Dims input_dims = inputs.at(0).GetTrtDims(); + nvinfer1::Dims output_dims{1, {input_dims.nbDims}}; + // Create a const node with the values of output_dims + TRT_ShapedWeights weight = params->weight_store->GetTempWeights( + nvinfer1::DataType::kINT32, output_dims); + int32* values_ptr = static_cast(weight.GetValues()); + std::copy(input_dims.d, input_dims.d + input_dims.nbDims, values_ptr); + auto output = params->converter->CreateConstantLayer(weight, output_dims); + params->outputs->push_back(TRT_TensorOrWeights(output)); + return Status::OK(); + } +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + if (params->validation_only) return Status::OK(); + nvinfer1::IShapeLayer* shape_layer = + params->converter->network()->addShape(*inputs.at(0).tensor()); + TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name()); + params->outputs->push_back(TRT_TensorOrWeights(shape_layer->getOutput(0))); + return Status::OK(); +#else + return errors::Unavailable( + "Shape op conversion requires TensorRT 6 or above"); +#endif +} + Status ConvertReshape(OpConverterParams* params) { const auto& inputs = params->inputs; TF_RETURN_IF_ERROR( @@ -3749,6 +3748,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); // Set parameters. 
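ConvertShape above gives TF-TRT a lowering for the Shape op: with a statically known input shape it materializes the dimensions as an INT32 constant layer, otherwise (TensorRT 6 and later) it emits an IShapeLayer, and it is rejected outright in implicit batch mode. At the TensorFlow level the op being covered is simply tf.shape, which yields a rank-1 int32 tensor:

import tensorflow as tf

@tf.function
def shape_times_two(x):
    # tf.shape produces a 1-D int32 tensor holding the runtime dimensions;
    # with the converter above, this node can stay inside a TF-TRT segment
    # when the engine is built in explicit batch (dynamic shape) mode.
    return tf.shape(x) * 2

print(shape_times_two(tf.zeros([2, 3, 5])))  # -> [ 4  6 10]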
#if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { @@ -3849,9 +3849,10 @@ Status ConvertRelu6(OpConverterParams* params) { nvinfer1::IActivationLayer* layer = params->converter->network()->addActivation( *inputs.at(0).tensor(), nvinfer1::ActivationType::kCLIP); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setAlpha(0.0f); layer->setBeta(6.0f); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4407,6 +4408,7 @@ Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(*tensor, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // Set quantization ranges. @@ -4484,7 +4486,7 @@ Status ConvertReduce(OpConverterParams* params) { int trt_axis; TF_RETURN_IF_ERROR( ConvertAxis(tf_axes_list[i], tensor->getDimensions().nbDims, - node_def.name(), /*use_implicit_batch=*/true, &trt_axis)); + node_def.name(), params->use_implicit_batch, &trt_axis)); axes |= (1 << trt_axis); } @@ -5055,6 +5057,7 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { combined_scale_weights.GetTrtWeights(), dummy_power_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + layer->setName(node_def.name().c_str()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5974,6 +5977,7 @@ static void RegisterValidatableOpConverters( (*registration)[pool_op_type] = ConvertPool3D; } #endif + (*registration)["Shape"] = ConvertShape; (*registration)["Rsqrt"] = ConvertRsqrt; (*registration)["Slice"] = ConvertSlice; (*registration)["Softmax"] = ConvertSoftmax; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 29eb24d2316..b127337e02a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1709,12 +1709,12 @@ class ParameterizedOpConverterTestBase std::tuple> { public: ParameterizedOpConverterTestBase() - : trt_mode(std::get<0>(GetParam())), - tf_type(std::get<1>(GetParam())), - converter_precision(std::get<2>(GetParam())) {} + : trt_mode_(std::get<0>(GetParam())), + tf_type_(std::get<1>(GetParam())), + converter_precision_(std::get<2>(GetParam())) {} void Reset() { - OpConverterTest::Reset(converter_precision, trt_mode); + OpConverterTest::Reset(converter_precision_, trt_mode_); input_data_.clear(); } @@ -1750,7 +1750,7 @@ class ParameterizedOpConverterTestBase if (!partial_input_shape_dims.empty()) { partial_shape = partial_input_shape_dims; } else { - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // In dynamic shape mode we make all dims unknown. 
partial_shape = std::vector(dims.size(), -1); } else { @@ -1776,7 +1776,7 @@ class ParameterizedOpConverterTestBase void AddTestTensor(const string& name, const std::vector& dims, const std::vector& values = {}, const std::vector& partial_input_shape_dims = {}) { - AddTestTensor(name, dims, tf_type, values, partial_input_shape_dims); + AddTestTensor(name, dims, tf_type_, values, partial_input_shape_dims); } // Builds and runs the converted network. Checks output tensor shape. Tests @@ -1785,7 +1785,8 @@ class ParameterizedOpConverterTestBase void BuildAndRun(const string& name, const std::vector>& expected_output_dims, const Status& expected_runtime_status, - const std::vector>>& matcher) { + const std::vector>>& matcher, + const std::vector& out_tf_types = {}) { TensorShape shape; const int n_output = expected_output_dims.size(); ASSERT_EQ(n_output, matcher.size()); @@ -1794,12 +1795,14 @@ class ParameterizedOpConverterTestBase TF_EXPECT_OK( TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); string out_name = (n_output == 1) ? name : StrCat(name, ":", i); - InputOutputData data{out_name, - ConstructTensor(shape.num_elements(), 0, tf_type)}; + DataType out_tf_type = + out_tf_types.size() > i ? out_tf_types[i] : tf_type_; + InputOutputData data{ + out_name, ConstructTensor(shape.num_elements(), 0, out_tf_type)}; output_data.push_back(data); } - ASSERT_FALSE(input_data_.empty()); - const int batch_size = input_data_[0].tensor.shape().dim_size(0); + const int batch_size = + input_data_.empty() ? 1 : input_data_[0].tensor.shape().dim_size(0); Status stat = OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) @@ -1824,20 +1827,22 @@ class ParameterizedOpConverterTestBase const std::vector& expected_output_dims, const Status& expected_conversion_status, const Status& expected_runtime_status, - const Matcher>& matcher) { + const Matcher>& matcher, + const std::vector& out_tf_types = {}) { RunValidationAndConversion(node_def, expected_conversion_status, name.c_str(), expected_output_dims); if (expected_conversion_status.ok()) { BuildAndRun(name, std::vector>({expected_output_dims}), expected_runtime_status, - std::vector>>({matcher})); + std::vector>>({matcher}), + out_tf_types); } } protected: - const TrtTestMode trt_mode; - const DataType tf_type; - const TrtPrecisionMode converter_precision; + const TrtTestMode trt_mode_; + const DataType tf_type_; + const TrtPrecisionMode converter_precision_; DataVec input_data_; }; @@ -2070,7 +2075,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { 37.342354, 41.013527, 30.9738, 34.469433, 45.018955, 48.59309, 59.369415, 63.04059}; for (auto get_node_def : get_node_def_vec) { - NodeDef tmp_node_def = get_node_def(tf_type, "NCHW", true, 0); + NodeDef tmp_node_def = get_node_def(tf_type_, "NCHW", true, 0); std::string op_name = tmp_node_def.op(); std::vector test_param{ {"NHWC", 0, false, 0, @@ -2092,7 +2097,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { errors::Unimplemented(StrCat("The input \"variance\" for ", op_name, " must be a constant, at my_batchnorm"))}, {"NCHW", 0, false, 0.01}}; // The last one is the only test that runs. 
- if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { test_param.push_back( {"NCHW", 0, false, 0.01, errors::InvalidArgument( @@ -2102,7 +2107,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { for (auto p : test_param) { Reset(); NodeDef node_def = - get_node_def(tf_type, p.data_format, p.is_training, p.epsilon); + get_node_def(tf_type_, p.data_format, p.is_training, p.epsilon); for (int i = 0; i < node_input.size(); i++) { if (i == 0 || i == p.tensor_input_idx) { // The first input (x) is always added as a tensor, and it hase shape @@ -2121,7 +2126,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { // the first arg is a tensor. TODO(tfeher) Check if one can relax this // restriction. Status expected_status = - (i != 0 && trt_mode == TrtTestMode::kImplicitBatch) + (i != 0 && trt_mode_ == TrtTestMode::kImplicitBatch) ? errors::InvalidArgument( StrCat("Batch size doesn't match for tensor ", node_input[i].name, @@ -2129,19 +2134,19 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { "converter batch size: 3 vs 2")) : Status::OK(); std::vector partial_input_shape; - if (i == 0 && trt_mode == TrtTestMode::kDynamicShape && + if (i == 0 && trt_mode_ == TrtTestMode::kDynamicShape && !p.keep_channel_unknown) { // keep channel dim static (known) partial_input_shape.resize(4, -1); partial_input_shape[1] = node_input[i].dims[1]; } - AddTestTensor(node_input[i].name, node_input[i].dims, tf_type, + AddTestTensor(node_input[i].name, node_input[i].dims, tf_type_, node_input[i].val, partial_input_shape, expected_status); } else { AddTestWeights(node_input[i].name, node_input[i].dims, - node_input[i].val, tf_type); + node_input[i].val, tf_type_); } } TestOpConverter("my_batchnorm", node_def, node_input[0].dims, @@ -2149,12 +2154,12 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { ArrayFloatNear(expected_output)); } } -} // namespace convert +} TEST_P(OpConverterTest1, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights); const NodeDef& node_def = transpose.operation.node()->def(); @@ -2182,13 +2187,13 @@ TEST_P(OpConverterTest1, ConvertTranspose) { {}, {3, 2, 1, 1}, {3, 2, 1, 0}, - (trt_mode == TrtTestMode::kImplicitBatch) + (trt_mode_ == TrtTestMode::kImplicitBatch) ? Status(error::UNIMPLEMENTED, "Transpose at batch dimension is not supported") : Status::OK()}, TestParamBase{{1, 1, 2, 3}, {}, {1, 3, 1, 2}, {0, 3, 1, 2}}, }; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // Dynamic shape tests where some shapes are known test_params.push_back(TestParamBase{ {1, 1, 2, 3}, {-1, 1, 2, -1}, {1, 3, 1, 2}, {0, 3, 1, 2}}); @@ -2309,6 +2314,55 @@ TEST_F(OpConverterTest, ConvertReshape) { } } +TEST_P(OpConverterTest1, ConvertShape) { + // Get the NodeDef for Shape op. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); + auto shape = ops::Shape(s.WithOpName("my_shape"), input); + const NodeDef& node_def = shape.operation.node()->def(); + + Status conversion_status = + (trt_mode_ == TrtTestMode::kImplicitBatch) + ? 
errors::Unimplemented( + "Shape is only supported for explicit batch mode.") + : Status::OK(); + std::vector test_params = { +// TODO(b/166274212): Enable the test parameter for TensorRT 7.1.3. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, +#endif + // Add input as weight (we use non empty param ({1}) to trigger this). + TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, + }; + + auto input_is_weight = [](const TestParamBase p) { return !p.param.empty(); }; + for (auto p : test_params) { + SCOPED_TRACE(p); + Reset(); + // The number of elements of the input tensor. We leave it 0 in case we do + // not need to add an input tensor. This happens in explicit batch mode: the + // shape is known at conversion time and therefore the shape is added to the + // network as a constant layer. In this case the single node network that + // we use for the unit test have no actual input tensor when it is converted + // to a TensorRT network. + int n_elements = 0; + if (input_is_weight(p) || trt_mode_ != TrtTestMode::kExplicitBatch) { + // Calculate the number of elements for adding input data. + n_elements = std::accumulate(p.input_dims.begin(), p.input_dims.end(), 1, + std::multiplies()); + } + std::vector input_val(n_elements, 1); + if (!input_is_weight(p)) { + AddTestTensor("input", p.input_dims, input_val); + } else { + AddTestWeights("input", p.input_dims, input_val, tf_type_); + } + TestOpConverter("my_shape", node_def, p.expected_output_dims, p.status, + p.runtime_status, ElementsAreArray(p.input_dims), + {DT_INT32}); + } +} + // Helper function for testing MatMul and BatchMatMul // get_matmul corresponds to the function used to generate the node. It should // accept (DataType, transpose_a, transpose_b) as parameters. @@ -2566,7 +2620,7 @@ TEST_P(OpConverterTest2, ConvertBiasAdd) { for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format, tf_type); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_type_); // Add input, dims_array will be like {2, 1, ..., 1, 3} std::vector dims_array(trt_input_rank + 1, 1); @@ -2588,7 +2642,7 @@ TEST_P(OpConverterTest2, ConvertBiasAdd) { for (int i = 0; i < channel_size; ++i) { bias[i] = i + 1; // bias will be {1, 2, 3, ...} } - AddTestWeights("weights", {channel_size}, bias, tf_type); + AddTestWeights("weights", {channel_size}, bias, tf_type_); // Build and run the engine. std::vector output_data; @@ -2624,7 +2678,7 @@ NodeDef GetBinaryOpNodeDef(DataType dtype) { TEST_P(OpConverterTest2, ConvertBinary) { { AttrValue dtype; - dtype.set_type(tf_type); + dtype.set_type(tf_type_); // Both inputs are weights. 
Reset(); NodeDef node_def = @@ -2669,19 +2723,19 @@ TEST_P(OpConverterTest2, ConvertBinary) { if (!op_test_info.count(op_name)) { FAIL() << "Binary op test map does not contain op " << op_name; } - NodeDef node_def = op_test_info[op_name].first(tf_type); + NodeDef node_def = op_test_info[op_name].first(tf_type_); std::vector input_names; std::vector> input_dims; std::vector> input_values; if (operand_1_is_tensor) { AddTestTensor("input1", {2, 1, 2}, {3, 6, 3, 6}); } else { - AddTestWeights("input1", {1, 2}, std::vector{3, 6}, tf_type); + AddTestWeights("input1", {1, 2}, std::vector{3, 6}, tf_type_); } if (operand_2_is_tensor) { AddTestTensor("input2", {2, 2, 1}, {2, 3, 2, 3}); } else { - AddTestWeights("input2", {2, 1}, std::vector{2, 3}, tf_type); + AddTestWeights("input2", {2, 1}, std::vector{2, 3}, tf_type_); } TestOpConverter("my_binary", node_def, {2, 2, 2}, Status::OK(), Status::OK(), @@ -2888,10 +2942,10 @@ TEST_P(OpConverterTest2, ConvertSquare) { // Input is weights, should fail. Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type_); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Square must be a tensor, at my_square"); @@ -2900,7 +2954,7 @@ TEST_P(OpConverterTest2, ConvertSquare) { Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); @@ -2913,7 +2967,7 @@ TEST_P(OpConverterTest2, ConvertSquare) { inputs[i] = value; expected_outputs[i] = value * value; } - AddTestTensor("input", {1, 1, 20}, tf_type, inputs); + AddTestTensor("input", {1, 1, 20}, tf_type_, inputs); TestOpConverter("my_square", node_def, {1, 1, 20}, Status::OK(), Status::OK(), ArrayFloatNear(expected_outputs, 0)); @@ -3040,7 +3094,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { { // Input is weights, should fail. Reset(); - const NodeDef& node_def = CreateUnaryOp(tf_type); + const NodeDef& node_def = CreateUnaryOp(tf_type_); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -3097,7 +3151,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { FAIL() << "Activation op test map does not contain op " << op_name; } Reset(); - NodeDef node_def = op_map[op_name].first(tf_type); + NodeDef node_def = op_map[op_name].first(tf_type_); const std::vector input = {-100, -2, -1, 0, 1, 88}; AddTestTensor("input", p.input_dims, input); @@ -3125,7 +3179,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { TEST_P(OpConverterTest1, ConvertExpandDims) { // Get the NodeDef for ExpandDims. 
Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto expanddims = ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights); @@ -3153,7 +3207,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { {}, {1, 1, 1, 2, 3}, {0}, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status(error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_expanddims") @@ -3162,7 +3216,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { {}, {1, 1, 1, 2, 3}, {-5}, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status(error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_expanddims") @@ -3200,7 +3254,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { } TEST_P(OpConverterTest1, ConvertSqueeze) { - const bool use_implicit_batch = (trt_mode == TrtTestMode::kImplicitBatch); + const bool use_implicit_batch = (trt_mode_ == TrtTestMode::kImplicitBatch); // Get the NodeDef for Squeeze. auto get_squeeze_nodedef = [](std::vector axes, DataType tf_type) -> NodeDef { @@ -3223,7 +3277,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { {}, // input partial dims {2, 3}, // expected output dims {}, // axis - trt_mode == TrtTestMode::kExplicitBatch + trt_mode_ == TrtTestMode::kExplicitBatch ? Status::OK() : Status{error::UNIMPLEMENTED, "Squeeze is not implemented for empty squeeze_dims, at " @@ -3282,7 +3336,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { "Dimension 2 with size 2 cannot be squeezed because it must be " "size 1, at my_squeeze"}}; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // In this test we try to squeeze axis=2 which has size > 1. In dynamic // shape mode the converter sees only -1, so it cannot catch this error. squeeze_non_singleton.status = Status::OK(); // conversion status @@ -3297,7 +3351,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { for (TestParamBase p : test_params) { SCOPED_TRACE(p); Reset(); - NodeDef node_def = get_squeeze_nodedef(p.param, tf_type); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_type_); AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, p.partial_input_dims); TestOpConverter("my_squeeze", node_def, p.expected_output_dims, p.status, @@ -4052,14 +4106,14 @@ TEST_F(OpConverterTest, ConvertSlice) { TEST_P(OpConverterTest1, ConvertConv2D) { // Get nodedef for Conv2D layer. 
- DataType tf_type_loc = tf_type; + DataType tf_type = tf_type_; auto get_conv2d_nodedef = - [tf_type_loc](std::vector strides = {1, 1, 1, 1}, - string padding = "SAME", string data_format = "NCHW", - std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type_loc); - auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type_loc); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); ops::Conv2D::Attrs attrs = ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, @@ -4152,12 +4206,12 @@ TEST_P(OpConverterTest1, ConvertConv2D) { node_def, error::UNIMPLEMENTED, "Stride must be 1 for batch and channel dimensions, at my_conv2d"); } - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { Reset(); NodeDef node_def = get_conv2d_nodedef(); // Channel dim unknown, should fail. AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, - TfDataTypeToTrt(tf_type)); + TfDataTypeToTrt(tf_type_)); AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4179,8 +4233,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { // Ok. std::vector ok_params = { -// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // Basic TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4192,9 +4244,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, -#endif -// TODO(b/162448349): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // SAME padding (Asymmetric) TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4217,9 +4266,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, -#endif -// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // NHWC TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4253,7 +4299,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, -#endif }; for (int i = 0; i < ok_params.size(); i++) { @@ -4262,15 +4307,15 @@ TEST_P(OpConverterTest1, ConvertConv2D) { get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, ok_params[i].dilations); std::vector partial_input_shape; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // The channel dim cannot have unknown size, fix that. partial_input_shape.resize(ok_params[i].input_dims.size(), -1); int channel_id = (ok_params[i].data_format == "NCHW") ? 
1 : 3; partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; } - AddTestTensor("input", ok_params[i].input_dims, tf_type, ok_params[i].input, - partial_input_shape); + AddTestTensor("input", ok_params[i].input_dims, tf_type_, + ok_params[i].input, partial_input_shape); AddTestWeights("weights", ok_params[i].filter_dims, ok_params[i].filter); @@ -4797,7 +4842,7 @@ TEST_P(OpConverterTest1, ConvertPool) { for (int nDim : test_nDims) { // Input is weights, should fail. Reset(); - NodeDef node_def = get_pool_nodedef(tf_type, nDim); + NodeDef node_def = get_pool_nodedef(tf_type_, nDim); AddTestWeights("input", {1, 1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, @@ -4906,7 +4951,7 @@ TEST_P(OpConverterTest1, ConvertPool) { for (bool is_max_pooling : {true, false}) { Reset(); NodeDef node_def = - get_pool_nodedef(tf_type, nDim, ksize, strides, p.padding, + get_pool_nodedef(tf_type_, nDim, ksize, strides, p.padding, data_format, is_max_pooling); AddTestTensor("input", input_dims, input); TestOpConverter("my_pool", node_def, expected_output_dims, Status::OK(), @@ -4968,7 +5013,7 @@ TEST_F(OpConverterTest, ConvertTopK) { TEST_P(OpConverterTest3, ConvertGather) { // Get the NodeDef for GatherV2. Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), tf_type); + auto params = ops::Placeholder(s.WithOpName("params"), tf_type_); auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); @@ -4976,7 +5021,7 @@ TEST_P(OpConverterTest3, ConvertGather) { { // Axis is a tensor, should fail. Reset(); - AddTestTensor("params", {1, 1, 2, 3}, tf_type, {}); + AddTestTensor("params", {1, 1, 2, 3}, tf_type_, {}); AddTestTensor("indices", {1, 2}, DT_INT32, {}); AddTestTensor("axis", {1}, DT_INT32, {}); RunValidationAndConversion( @@ -5021,7 +5066,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 1, 3}, /*expected_output=*/{4, 5, 6, 1, 2, 3}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the" " batch dimension, at my_gather"} @@ -5034,7 +5079,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 2, 1}, /*expected_output=*/{3, 1, 6, 4}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "Indices must have a batch size of 1 when params" " is a tensor."} @@ -5048,7 +5093,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 2}, /*expected_output=*/{2, 3, 5, 6}, /*params_is_tensor=*/false, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "The input axis must be zero when params is a" " weight."} @@ -5061,13 +5106,13 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2}, /*expected_output=*/{2, 4}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch // conversion_status + trt_mode_ == TrtTestMode::kImplicitBatch // conversion_status ? 
Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_gather"} : Status::OK(), - Status::OK(), // runtime_status - trt_mode == TrtTestMode::kImplicitBatch // add_index_status + Status::OK(), // runtime_status + trt_mode_ == TrtTestMode::kImplicitBatch // add_index_status ? Status{error::INVALID_ARGUMENT, "Batch size doesn't match for tensor indices: " "Provided batch size does not match converter " @@ -5182,7 +5227,7 @@ TEST_P(OpConverterTest3, ConvertGather) { if (p.params_is_tensor) { AddTestTensor("params", p.params_shape, params_input); } else { - AddTestWeights("params", p.params_shape, params_input, tf_type); + AddTestWeights("params", p.params_shape, params_input, tf_type_); } AddTestTensor("indices", p.indices_shape, DT_INT32, p.indices, {}, p.add_index_status); @@ -5192,6 +5237,150 @@ TEST_P(OpConverterTest3, ConvertGather) { } } +template +NodeDef CreateReduceOp(DataType tf_type, bool keep_dims) { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); + typename OpType::Attrs op_attrs; + op_attrs.keep_dims_ = keep_dims; + auto op = OpType(s.WithOpName("my_reduce"), input, axis, op_attrs); + return op.operation.node()->def(); +} + +// Applies reduction op on sub-sequences of input +// output[i] = reduce(input[m * i : m * (i +1)]) +std::vector CalcReduce(string op_name, std::vector input, int m, + float (*op)(float, float), float init) { + std::vector output(input.size() / m); + for (int i = 0; i < output.size(); i++) { + auto begin = input.begin() + i * m; + auto end = input.begin() + (i + 1) * m; + output[i] = std::accumulate(begin, end, init, op); + if (op_name == "Mean") { + output[i] /= m; + } + } + return output; +} +TEST_P(OpConverterTest1, ConvertReduce) { + { + // Input is weights, should fail. + Reset(); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestWeights("axis", {1}, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"input\" for Sum must be a tensor, at my_reduce"); + } + { + // Axis is weights, should fail. + Reset(); + const NodeDef node_def = CreateReduceOp(tf_type_, false); + AddTestTensor("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); + AddTestTensor("axis", {1}, DT_INT32, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "The input \"axis\" for Sum must be a constant, at my_reduce"); + } + using OpFunc = std::function; + using ValFunc = float (*)(float, float); + struct ReduceTestDescriptor { + string name; + OpFunc get_node; + ValFunc val_func; + float init_val; + }; + std::vector op_test_info{ + {"Sum", CreateReduceOp, [](float x, float y) { return x + y; }, + 0}, + {"Prod", CreateReduceOp, + [](float x, float y) { return x * y; }, 1}, + {"Mean", CreateReduceOp, + [](float x, float y) { return x + y; }, 0}, + {"Min", CreateReduceOp, + [](float x, float y) { return y < x ? y : x; }, 1000}, + {"Max", CreateReduceOp, + [](float x, float y) { return x < y ? 
y : x; }, -1000}}; + + std::vector input_values{1, 2, 3, 4, 5, 6}; + struct TestParams { + std::vector input_dims; + std::vector input_values; + // Helper array contains the same elements as input but permuted in a way + // that the reduction can be calculated over contiguous elements using + // CalcReduce + std::vector helper_array; + std::vector axis; + int stride; // product of input_dims along axis + Status conversion_status; + }; + std::vector params{ + // Out of range tests + TestParams{{2, 3, 1}, input_values, input_values, {3}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-4}, 3}, + // Ok tests + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {0}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {1}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {2}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {0, 1}, 6}, + // Ok tests with negative axis values + TestParams{{2, 3, 1}, input_values, {1, 4, 2, 5, 3, 6}, {-3}, 2}, + TestParams{{2, 3, 1}, input_values, input_values, {-2}, 3}, + TestParams{{2, 3, 1}, input_values, input_values, {-1}, 1}, + TestParams{{2, 3, 1}, input_values, input_values, {-3, 1}, 6}, + }; + + for (bool keep_dims : {false, true}) { + for (auto& op : op_test_info) { + for (auto p : params) { + SCOPED_TRACE(StrCat(op.name, keep_dims ? "keep_dims" : "")); + Reset(); + NodeDef node_def = op.get_node(tf_type_, keep_dims); + + AddTestTensor("input", p.input_dims, p.input_values); + AddTestWeights("axis", {static_cast(p.axis.size())}, + p.axis); + std::vector expected_output_dims(p.input_dims); + + // Set expected output dim and conversion error messages + for (int ax : p.axis) { + int rank = p.input_dims.size(); + if (ax >= rank || ax < -rank) { + p.conversion_status = + errors::InvalidArgument("Axis value of ", ax, + " is out of bounds, must be in " + "range [", + -rank, ", ", rank, "), at my_reduce"); + } else { + int ax_positive = ax >= 0 ? ax : ax + rank; + // Zero marks elements that we will remove later. + expected_output_dims[ax_positive] = keep_dims ? 1 : 0; + if (trt_mode_ == TrtTestMode::kImplicitBatch && + (ax == 0 || ax == -rank)) { + p.conversion_status = errors::Unimplemented( + "TensorRT does not allow manipulation of the batch " + "dimension, at my_reduce"); + } + } + } + expected_output_dims.erase(std::remove(expected_output_dims.begin(), + expected_output_dims.end(), 0), + expected_output_dims.end()); + VLOG(2) << "out dims " + << absl::StrCat("[", absl::StrJoin(expected_output_dims, ","), + "]"); + std::vector expected_values = CalcReduce( + op.name, p.helper_array, p.stride, op.val_func, op.init_val); + TestOpConverter("my_reduce", node_def, expected_output_dims, + p.conversion_status, Status::OK(), + ArrayFloatNear(expected_values)); + } + } + } +} + NodeDef CreateCastOp(DataType tf_type) { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); @@ -5204,7 +5393,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { { // Input is weights, should fail. 
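// The new ConvertReduce test above computes its expected values on the host:
// the helper array is arranged so every reduced group is contiguous, and each
// group of `m` elements is folded with std::accumulate (then divided by `m`
// for Mean). A standalone sketch of that reference computation, using only
// the standard library:
#include <cassert>
#include <numeric>
#include <string>
#include <vector>

std::vector<float> ReduceContiguous(const std::string& op_name,
                                    const std::vector<float>& input, int m,
                                    float (*op)(float, float), float init) {
  std::vector<float> output(input.size() / m);
  for (size_t i = 0; i < output.size(); ++i) {
    auto begin = input.begin() + i * m;
    output[i] = std::accumulate(begin, begin + m, init, op);
    if (op_name == "Mean") output[i] /= m;
  }
  return output;
}

int main() {
  const std::vector<float> input = {1, 2, 3, 4, 5, 6};  // shape {2, 3, 1}
  // Sum over axis 1 (stride 3): {1+2+3, 4+5+6} == {6, 15}.
  auto sums = ReduceContiguous(
      "Sum", input, 3, [](float x, float y) { return x + y; }, 0.0f);
  assert((sums == std::vector<float>{6.0f, 15.0f}));
  // Mean over axis 1: {2, 5}.
  auto means = ReduceContiguous(
      "Mean", input, 3, [](float x, float y) { return x + y; }, 0.0f);
  assert((means == std::vector<float>{2.0f, 5.0f}));
  return 0;
}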
Reset(); - const NodeDef node_def = CreateUnaryOp(tf_type); + const NodeDef node_def = CreateUnaryOp(tf_type_); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -5260,7 +5449,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { if (!op_map.count(op_name)) { FAIL() << "Unary op test map does not contain op " << op_name; } - NodeDef node_def = op_map[op_name].first(tf_type); + NodeDef node_def = op_map[op_name].first(tf_type_); // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for // now. Need to find a better way to express input and output types. @@ -5268,7 +5457,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { // TODO(tfeher): improve tests by defining an expected output data type and // check that. Currently only the shape and values of the output are // checked. - DataType input_tf_type = op_name == "Cast" ? DT_HALF : tf_type; + DataType input_tf_type = op_name == "Cast" ? DT_HALF : tf_type_; std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; AddTestTensor("input", p.input_dims, input_tf_type, input_values); @@ -5835,7 +6024,7 @@ TEST_P(OpConverterTest2, ConvertPack) { /*axis=*/1, /*expected_output_dims=*/{1, 2, 2, 3}, /*expected_output=*/InitTestVector(12), - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "The input \"values_1\" for Pack must be a tensor, at " "my_pack"} @@ -5861,7 +6050,7 @@ TEST_P(OpConverterTest2, ConvertPack) { /*axis=*/-4, /*expected_output_dims=*/{2, 1, 2, 3}, /*expected_output=*/InitTestVector(12), - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the batch " "dimension, at my_pack"} @@ -5921,7 +6110,7 @@ TEST_P(OpConverterTest2, ConvertPack) { }, }; // Inputs have inconsistent shapes, should fail. - if (trt_mode != TrtTestMode::kDynamicShape) { + if (trt_mode_ != TrtTestMode::kDynamicShape) { params.push_back(TestParams{ /*input_shapes=*/{{1, 2, 3}, {1, 3, 2}}, /*partial_input_shapes=*/{{}, {}}, @@ -5941,7 +6130,7 @@ TEST_P(OpConverterTest2, ConvertPack) { // TODO(tfeher) Add dynamic shapes test once TRT handles shape error // decently } - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // Test with mixed dynamic / static shape input tensors params.push_back( TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, @@ -5957,14 +6146,14 @@ TEST_P(OpConverterTest2, ConvertPack) { const int num_inputs = p.input_shapes.size(); EXPECT_EQ(num_inputs, p.input_values.size()); - NodeDef node_def = GetPackNodeDef(tf_type, num_inputs, p.axis); + NodeDef node_def = GetPackNodeDef(tf_type_, num_inputs, p.axis); // Create inputs. for (int j = 0; j < num_inputs; ++j) { if (j == 1 && p.input_1_is_weight) { AddTestWeights(StrCat("values_", j), p.input_shapes[j], - p.input_values[j], tf_type); + p.input_values[j], tf_type_); } else { - AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type, + AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type_, p.input_values[j], p.partial_input_shapes[j]); } } @@ -6492,7 +6681,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { { // Input is a weight, should fail. 
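// The ConvertPack cases above exercise both positive and negative `axis`
// values: Pack inserts a new dimension of size N (the number of inputs) at
// the normalized axis. A standalone sketch of that shape rule (hypothetical
// helper, not converter code):
#include <cassert>
#include <vector>

std::vector<int> PackShape(const std::vector<int>& input_shape, int num_inputs,
                           int axis) {
  const int rank = static_cast<int>(input_shape.size());
  if (axis < 0) axis += rank + 1;  // axis -4 on rank-3 inputs becomes 0.
  std::vector<int> out = input_shape;
  out.insert(out.begin() + axis, num_inputs);
  return out;
}

int main() {
  // Mirrors the test: two {1, 2, 3} inputs packed at axis 1 -> {1, 2, 2, 3}.
  assert((PackShape({1, 2, 3}, 2, 1) == std::vector<int>{1, 2, 2, 3}));
  // axis -4 puts the new dimension in front, {2, 1, 2, 3}, which implicit
  // batch mode rejects because it manipulates the batch dimension.
  assert((PackShape({1, 2, 3}, 2, -4) == std::vector<int>{2, 1, 2, 3}));
  return 0;
}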
Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestTensor("y", {1, 1, 2, 3}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, @@ -6519,7 +6708,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { /*value_y=*/std::vector(7 * 5, 0), /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/common_input, - trt_mode == TrtTestMode::kDynamicShape + trt_mode_ == TrtTestMode::kDynamicShape ? Status::OK() : errors::InvalidArgument("Infeasible broadcast scheme"), errors::Internal( @@ -6545,7 +6734,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { for (auto p : params) { Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); AddTestTensor("x", p.dims_x, p.value_x); AddTestTensor("y", p.dims_y, p.value_y); TestOpConverter("my_squared_diff", node_def, p.expected_output_dims, @@ -6581,7 +6770,7 @@ template void TestConvertResize(OpConverterTest* test) { typedef typename EnumToDataType::Type CType; - std::vector> params{ + std::vector> params { // TODO(b/162442839): Enable the test parameters for TRT 7.1.3.x. #if !IS_TRT_VERSION_GE(7, 1, 3, 0) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index a69960005fc..1fc0d13c993 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -241,36 +241,6 @@ int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { #endif -string GetLinkedTensorRTVersion() { - int major, minor, patch; -#if GOOGLE_CUDA && GOOGLE_TENSORRT - major = NV_TENSORRT_MAJOR; - minor = NV_TENSORRT_MINOR; - patch = NV_TENSORRT_PATCH; -#else - major = 0; - minor = 0; - patch = 0; -#endif - return absl::StrCat(major, ".", minor, ".", patch); -} - -string GetLoadedTensorRTVersion() { - int major, minor, patch; -#if GOOGLE_CUDA && GOOGLE_TENSORRT - int ver = getInferLibVersion(); - major = ver / 1000; - ver = ver - major * 1000; - minor = ver / 100; - patch = ver - minor * 100; -#else - major = 0; - minor = 0; - patch = 0; -#endif - return absl::StrCat(major, ".", minor, ".", patch); -} - absl::string_view GetDeviceName(const Node* node) { if (node->has_assigned_device_name()) { return node->assigned_device_name(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index a0505c3f922..7570dff1c9d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -117,14 +117,6 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); -// Returns a string that includes compile time TensorRT library version -// information {Maj, Min, Patch}. -string GetLinkedTensorRTVersion(); - -// Returns a string that includes runtime time TensorRT library version -// information {Maj, Min, Patch}. -string GetLoadedTensorRTVersion(); - // Returns true if an engine built for cached_shapes can also run actual_shapes. 
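// The two helpers removed from convert/utils.cc above decoded the integer
// returned by TensorRT's getInferLibVersion() into {major, minor, patch};
// equivalent helpers now live in common/utils (see the py_utils_wrapper.cc
// hunk further below). The decoding itself is plain integer arithmetic, shown
// here as a standalone sketch:
#include <cassert>
#include <tuple>

std::tuple<int, int, int> DecodeTrtVersion(int ver) {
  const int major = ver / 1000;
  ver -= major * 1000;
  const int minor = ver / 100;
  const int patch = ver - minor * 100;
  return {major, minor, patch};
}

int main() {
  // getInferLibVersion() reports e.g. 7103 for TensorRT 7.1.3.
  assert(DecodeTrtVersion(7103) == std::make_tuple(7, 1, 3));
  assert(DecodeTrtVersion(6005) == std::make_tuple(6, 0, 5));
  return 0;
}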
bool AreShapesCompatible(const std::vector& actual_shapes, const std::vector& cached_shapes); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 58d1c611463..5b2ae822d59 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -800,6 +800,9 @@ StatusOr> TRTEngineOp::GetEngine( TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); infer->setGpuAllocator(allocator); + // Need to initialize plugins in order to deserialize engines that contain + // plugins. + MaybeInitializeTrtPlugins(&logger); TrtUniquePtrType static_engine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc deleted file mode 100644 index 141a7d1f462..00000000000 --- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc +++ /dev/null @@ -1,236 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h" -#include "tensorflow/core/platform/logging.h" - -#if GOOGLE_CUDA && GOOGLE_TENSORRT -#define EIGEN_USE_GPU // For definition of Eigen::GpuDevice. 
-#include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#include "tensorflow/core/util/gpu_kernel_helper.h" -#include "third_party/tensorrt/NvInfer.h" - -namespace tensorflow { -namespace tensorrt { -using nvinfer1::DataType; -using nvinfer1::Dims; -using nvinfer1::IPluginCreator; -using nvinfer1::IPluginV2; -using nvinfer1::IPluginV2Ext; -using nvinfer1::PluginField; -using nvinfer1::PluginFieldCollection; -using nvinfer1::PluginFieldType; -using nvinfer1::PluginFormat; - -template -__global__ void Cast(const SrcT* input, int num_elements, DstT* output) { - for (int i : CudaGridRangeX(num_elements)) { - output[i] = static_cast(input[i]); - } -} - -template -void RunCast(const SrcT* d_input, int num_elements, DstT* d_output, - cudaStream_t stream) { - const int threads_per_block = 256; - const int blocks_per_grid = - (num_elements + threads_per_block - 1) / threads_per_block; - TF_CHECK_OK(CudaLaunchKernel(Cast, threads_per_block, - blocks_per_grid, 0, stream, d_input, - num_elements, d_output)); -} - -const char* kPluginName = "TfTrtPluginCast"; - -class CastPlugin : public TrtPlugin { - public: - CastPlugin(DataType src_type, DataType dst_type) - : src_type_(src_type), dst_type_(dst_type) {} - - CastPlugin(const void* serialized_data, size_t length) - : TrtPlugin(serialized_data, length) { - const char* buffer = static_cast(serialized_data); - src_type_ = ReadFromBuffer(&buffer); - dst_type_ = ReadFromBuffer(&buffer); - src_dims_ = ReadFromBuffer(&buffer); - } - - CastPlugin(const CastPlugin& rhs) - : TrtPlugin(rhs), - src_type_(rhs.src_type_), - dst_type_(rhs.dst_type_), - src_dims_(rhs.src_dims_) {} - - // Methods from IPluginV2Ext. - - DataType getOutputDataType(int index, const DataType* input_types, - int num_inputs) const override { - DCHECK_EQ(0, index); - DCHECK_EQ(1, num_inputs); - return dst_type_; - } - - bool isOutputBroadcastAcrossBatch(int output_index, - const bool* input_is_broadcasted, - int num_inputs) const override { - return false; - } - - bool canBroadcastInputAcrossBatch(int input_index) const override { - return false; - } - - void configurePlugin(const Dims* input_dims, int num_inputs, - const Dims* output_dims, int num_outputs, - const DataType* input_types, - const DataType* output_types, - const bool* input_is_broadcast, - const bool* output_is_broadcast, - PluginFormat float_format, int max_batch_size) override { - DCHECK_EQ(1, num_inputs); - DCHECK_EQ(1, num_outputs); - DCHECK(src_type_ == input_types[0]); - DCHECK(dst_type_ == output_types[0]); - src_dims_ = input_dims[0]; - } - - IPluginV2Ext* clone() const override { return new CastPlugin(*this); } - - // Methods from IPluginV2. 
- - const char* getPluginType() const override { return kPluginName; }; - - const char* getPluginVersion() const override { return kTfTrtPluginVersion; }; - - int getNbOutputs() const override { return 1; } - - Dims getOutputDimensions(int index, const Dims* inputs, - int num_input_dims) override { - DCHECK_EQ(0, index); - DCHECK_EQ(1, num_input_dims); - return inputs[0]; - } - - bool supportsFormat(DataType type, PluginFormat format) const override { - return type == DataType::kFLOAT || type == DataType::kINT32; - } - - size_t getWorkspaceSize(int max_batch_size) const override { return 0; } - - int enqueue(int batch_size, const void* const* inputs, void** outputs, void*, - cudaStream_t stream) override { - int num_elements = batch_size; - for (int i = 0; i < src_dims_.nbDims; i++) { - num_elements *= src_dims_.d[i]; - } - const void* input = inputs[0]; - void* output = outputs[0]; - DCHECK_NE(static_cast(src_type_), static_cast(dst_type_)); - - switch (src_type_) { - case DataType::kFLOAT: - RunCast(reinterpret_cast(input), num_elements, - reinterpret_cast(output), stream); - break; - case DataType::kINT32: - RunCast(reinterpret_cast(input), num_elements, - reinterpret_cast(output), stream); - break; - default: - return 1; // Indicates a failure. - } - return 0; - } - - size_t getSerializationSize() const override { - return 2 * sizeof(DataType) + sizeof(Dims); - } - - void serialize(void* serialized_data) const override { - char* buffer = static_cast(serialized_data); - WriteToBuffer(src_type_, &buffer); - WriteToBuffer(dst_type_, &buffer); - WriteToBuffer(src_dims_, &buffer); - } - - private: - DataType src_type_; - DataType dst_type_; - Dims src_dims_; -}; - -class CastPluginCreator : public IPluginCreator { - public: - CastPluginCreator() { - setPluginNamespace(kTfTrtPluginNamespace); - plugin_fields_.emplace_back( - PluginField("SrcT", nullptr, PluginFieldType::kINT32, 1)); - plugin_fields_.emplace_back( - PluginField("DstT", nullptr, PluginFieldType::kINT32, 1)); - - field_collection_.nbFields = plugin_fields_.size(); - field_collection_.fields = plugin_fields_.data(); - } - - const char* getPluginName() const override { return kPluginName; } - - const char* getPluginVersion() const override { return kTfTrtPluginVersion; } - - const PluginFieldCollection* getFieldNames() override { - return &field_collection_; - } - - IPluginV2* createPlugin( - const char* name, - const PluginFieldCollection* field_collection) override { - const PluginField* fields = field_collection->fields; - DataType src_type, dst_type; - for (int i = 0; i < field_collection->nbFields; ++i) { - const char* attr_name = fields[i].name; - if (!strcmp(attr_name, "SrcT")) { - src_type = *static_cast(fields[i].data); - } else if (!strcmp(attr_name, "DstT")) { - dst_type = *static_cast(fields[i].data); - } else { - return nullptr; - } - } - return new CastPlugin(src_type, dst_type); - } - - IPluginV2* deserializePlugin(const char* name, const void* serial_data, - size_t serial_len) override { - return new CastPlugin(serial_data, serial_len); - } - - void setPluginNamespace(const char* plugin_namespace) override { - namespace_ = plugin_namespace; - } - - const char* getPluginNamespace() const override { return namespace_.c_str(); } - - private: - PluginFieldCollection field_collection_; - std::vector plugin_fields_; - std::string namespace_; -}; - -REGISTER_TFTRT_PLUGIN(CastPluginCreator); - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git 
a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 1337a733f91..021e28ec6f0 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -676,6 +676,21 @@ void AddSegmentForNode(const grappler::GraphProperties* graph_properties, device_name); } +bool OpBatchSizeExceedMaximumBatchSize( + const grappler::GraphProperties* graph_properties, const Node* node, + bool use_implicit_batch, absl::optional maximum_batch_size) { + ClusterBatchSize cluster_batch_size = + GetClusterBatchSizeForNode(graph_properties, node, use_implicit_batch); + if (cluster_batch_size.HasStaticBatchValue() && + maximum_batch_size.has_value() && + cluster_batch_size.GetStaticBatchValue() > maximum_batch_size.value()) { + VLOG(2) << "OP batch size " << cluster_batch_size.GetStaticBatchValue() + << " max_batch_size " << maximum_batch_size.value(); + return true; + } + return false; +} + } // namespace Status SegmentGraph(const Graph* tf_graph, @@ -690,6 +705,10 @@ Status SegmentGraph(const Graph* tf_graph, "Explicit batch mode should allow dynamic non-batch dimensions"); } + if (options.use_implicit_batch && !options.maximum_batch_size.has_value()) { + return errors::Internal("Implicit batch mode requires maximum_batch_size"); + } + if (!options.allow_dynamic_non_batch_dim && !graph_properties) { return errors::Internal( "Need graph propertities to disallow dynamic non-batch dimensions"); @@ -768,6 +787,14 @@ Status SegmentGraph(const Graph* tf_graph, << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name() << ")"; exclude_node("Denylisted with the env var TF_TRT_OP_DENYLIST"); + } else if (OpBatchSizeExceedMaximumBatchSize( + graph_properties, node->tf_node(), + options.use_implicit_batch, options.maximum_batch_size)) { + LOG_WARNING_WITH_PREFIX + << "Implicit batch mode requires OP batch size not larger than " + << "the converter maximum batch size: " + << "(Op name: " << node->name() << ")"; + exclude_node("OP batch size too large"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 3f79983cfd2..bab6e089fa4 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/types/optional.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" @@ -38,6 +39,9 @@ struct SegmentOptions { // Segment must contain at least this many nodes. int minimum_segment_size = 2; bool use_implicit_batch = true; + // The maximum batch size used to build the engines in the graph, when + // use_implicit_batch is true. + absl::optional maximum_batch_size = absl::nullopt; // When use_implicit_batch is false or when we are building dynamic engines, // we allow dynamic non-batch dimensions. 
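// The segmenter change above rejects candidate ops whose static batch size
// exceeds the engine's maximum batch size; the option is optional-valued
// because explicit-batch mode has no such limit. A standalone sketch of the
// same check using std::optional (the TensorFlow code uses absl::optional):
#include <cassert>
#include <optional>

bool ExceedsMaximumBatchSize(std::optional<int> static_batch_size,
                             std::optional<int> maximum_batch_size) {
  // Only reject when both values are known and the op's batch is too large.
  return static_batch_size.has_value() && maximum_batch_size.has_value() &&
         *static_batch_size > *maximum_batch_size;
}

int main() {
  assert(ExceedsMaximumBatchSize(5, 1));              // 5 > 1: excluded
  assert(!ExceedsMaximumBatchSize(5, 1000));          // within the limit
  assert(!ExceedsMaximumBatchSize(std::nullopt, 1));  // dynamic batch: keep
  assert(!ExceedsMaximumBatchSize(5, std::nullopt));  // no limit configured
  return 0;
}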
bool allow_dynamic_non_batch_dim = false; diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index bf277328fe7..ee406c9743f 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -108,8 +108,9 @@ class SegmentTest : public ::testing::Test { segment_options_.allow_dynamic_non_batch_dim = true; } - void EnableImplicitBatchModeForStaticEngine() { + void EnableImplicitBatchModeForStaticEngine(int maximum_batch_size = 1000) { segment_options_.use_implicit_batch = true; + segment_options_.maximum_batch_size = maximum_batch_size; segment_options_.allow_dynamic_non_batch_dim = false; } @@ -487,7 +488,11 @@ TEST_F(SegmentTest, TwoChainsDiffBatchSizes) { const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; EnableImplicitBatchModeForStaticEngine(); RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, - {{"output-0", "const-scalar"}}); + /*expected_segments=*/{{"output-0", "const-scalar"}}); + + EnableImplicitBatchModeForStaticEngine(1); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{}); } TEST_F(SegmentTest, SameRankImplicitBroadcastingStaticBatchSize) { diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index b91f5771ce5..54bbc251e4f 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -109,8 +109,9 @@ class ClusterBatchSize { return s; } - private: bool HasStaticBatchValue() const { return static_batch_value_.has_value(); } + + private: bool HasDynamicBatchValue() const { return has_dynamic_batch_value_; } private: diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index a8e24aa8983..3f8a11f7410 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -41,31 +41,5 @@ bool IsGoogleTensorRTEnabled() { #endif } -void GetLinkedTensorRTVersion(int* major, int* minor, int* patch) { -#if GOOGLE_CUDA && GOOGLE_TENSORRT - *major = NV_TENSORRT_MAJOR; - *minor = NV_TENSORRT_MINOR; - *patch = NV_TENSORRT_PATCH; -#else - *major = 0; - *minor = 0; - *patch = 0; -#endif -} - -void GetLoadedTensorRTVersion(int* major, int* minor, int* patch) { -#if GOOGLE_CUDA && GOOGLE_TENSORRT - int ver = getInferLibVersion(); - *major = ver / 1000; - ver = ver - *major * 1000; - *minor = ver / 100; - *patch = ver - *minor * 100; -#else - *major = 0; - *minor = 0; - *patch = 0; -#endif -} - } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h index f52bb6f1bad..9b24eb36cf9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h @@ -21,12 +21,6 @@ namespace tensorrt { bool IsGoogleTensorRTEnabled(); -// Return compile time TensorRT library version information {Maj, Min, Patch}. -void GetLinkedTensorRTVersion(int* major, int* minor, int* patch); - -// Return runtime time TensorRT library version information {Maj, Min, Patch}. 
-void GetLoadedTensorRTVersion(int* major, int* minor, int* patch); - } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc index 03f77c6bd5f..52252f125ac 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc @@ -16,18 +16,15 @@ limitations under the License. #include #include "pybind11/pybind11.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" std::tuple get_linked_tensorrt_version() { - int major, minor, patch; - tensorflow::tensorrt::GetLinkedTensorRTVersion(&major, &minor, &patch); - return std::tuple{major, minor, patch}; + return tensorflow::tensorrt::GetLinkedTensorRTVersion(); } std::tuple get_loaded_tensorrt_version() { - int major, minor, patch; - tensorflow::tensorrt::GetLoadedTensorRTVersion(&major, &minor, &patch); - return std::tuple{major, minor, patch}; + return tensorflow::tensorrt::GetLoadedTensorRTVersion(); } PYBIND11_MODULE(_pywrap_py_utils, m) { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index d4f3a524577..a73877bc3cc 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -74,7 +74,7 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, // algorithm uses too much memory. If we don't fail immediately building the // engine can be *very* slow with TensorRT7 when GPU memory is limited. AllocationAttributes attributes; - attributes.no_retry_on_failure = true; + attributes.retry_on_failure = false; void* mem = allocator_->AllocateRaw(alignment, total_size, attributes); if (!mem) return nullptr; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 70a0a9a7b65..2f31865751f 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT @@ -35,14 +36,16 @@ void TrtShapeOptimizationProfile::InitProfiles() { << "for each input (min=opt=max)."; } for (auto& shape_vec : input_shapes_) { - std::vector dimvec; - for (auto& shape : shape_vec) { - dimvec.push_back(TensorShapeToTrtDims(shape, false)); + if (!shape_vec.empty()) { + std::vector dimvec(shape_vec.size()); + absl::c_transform(shape_vec, dimvec.begin(), [](TensorShape shape) { + return TensorShapeToTrtDims(shape, false); + }); + // Set min=opt=max. + OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec}; + profiles_.push_back(std::move(profConfig)); + VLOG(1) << "Created profile " << profiles_.back().DebugString(); } - // We set min=opt=max. 
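// The InitProfiles() rewrite above maps every collected input-shape vector to
// TensorRT dims with a container transform, skips empty shape vectors, and
// sets min = opt = max so each profile matches exactly the shapes that were
// seen. A standalone sketch of that flow with simplified stand-in types:
#include <algorithm>
#include <cassert>
#include <vector>

using Shape = std::vector<int>;  // stands in for TensorShape
using Dims = std::vector<int>;   // stands in for nvinfer1::Dims

struct ProfileConfig {
  std::vector<Dims> min, opt, max;
};

std::vector<ProfileConfig> InitProfiles(
    const std::vector<std::vector<Shape>>& collected_input_shapes) {
  std::vector<ProfileConfig> profiles;
  for (const auto& shape_vec : collected_input_shapes) {
    if (shape_vec.empty()) continue;  // nothing was recorded for this run
    std::vector<Dims> dimvec(shape_vec.size());
    std::transform(shape_vec.begin(), shape_vec.end(), dimvec.begin(),
                   [](const Shape& shape) { return Dims(shape); });
    profiles.push_back(ProfileConfig{dimvec, dimvec, dimvec});  // min=opt=max
  }
  return profiles;
}

int main() {
  auto profiles = InitProfiles({{{1, 28, 28}}, {}, {{4, 28, 28}}});
  assert(profiles.size() == 2);  // the empty entry is skipped
  assert(profiles[0].min == profiles[0].max);
  return 0;
}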
- OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec}; - profiles_.push_back(std::move(profConfig)); - VLOG(1) << "Created profile " << profiles_.back().DebugString(); } } diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 1e57c11b2cf..1a91f54afc9 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -337,7 +337,6 @@ cc_library( visibility = [":friends"], deps = [ ":common", - ":frontend_attributes_util", ":host_compute_metadata_proto_cc", ":rearrange_function_argument", ":sharding_util", @@ -353,23 +352,17 @@ cc_library( "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", - "//tensorflow/compiler/jit:xla_cluster_util", + "//tensorflow/compiler/mlir:array_container_utils", "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", - "//tensorflow/compiler/tf2xla/lib:util", - "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", @@ -378,11 +371,8 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", ], @@ -787,6 +777,7 @@ tf_cc_test( "//tensorflow/cc:function_ops", "//tensorflow/cc:functional_ops", "//tensorflow/cc:ops", + "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:xla_cluster_util", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu_internal", @@ -828,9 +819,9 @@ cc_library( ":frontend_attributes_util", ":functionalize_control_flow_util", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -856,9 +847,9 @@ cc_library( ":functionalize_control_flow_util", ":functionalize_while", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -944,9 +935,9 @@ cc_library( ":functionalize_cond", ":functionalize_control_flow_util", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", 
"//tensorflow/core:framework", @@ -1087,6 +1078,7 @@ tf_cuda_cc_test( "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/compiler/jit", + "//tensorflow/compiler/jit:flags", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc index 936b74f7b33..c7c8702b49b 100644 --- a/tensorflow/compiler/tf2xla/const_analysis_test.cc +++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/graph/algorithm.h" @@ -217,5 +218,10 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_1) { EXPECT_EQ(const_args, std::vector({true})); } +static bool Initialized = [] { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + return true; +}(); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 54abccb4cfc..452b102fade 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -25,9 +25,10 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/str_join.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/graph_to_functiondef.h" diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 10b26f9801c..2a3e35e0ffd 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -23,12 +23,12 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/functionalize_while.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" @@ -46,12 +46,254 @@ limitations under the License. namespace tensorflow { +// Helper functions for functionalizing control flow in functions. + +// Maps function name to +// - new function name, if the function body was functionalized +// - absl::nullopt, if not +using FuncMap = std::map>; +using FuncMapIter = std::map>::const_iterator; + +// Returns whether function has been processed before. 
+bool FunctionHasBeenProcessed(FuncMapIter func_iter, const FuncMap* func_map) { + return func_iter != func_map->end(); +} + +// Returns whether function has been modified (i.e., functionalized) before. +bool FunctionHasBeenModified(FuncMapIter func_iter) { + return func_iter->second.has_value(); +} + +// Returns a name for the new functionalized version of a function. +string GetNewFunctionName( + const string& func_name, Node* n, + AssociatedFunctionInfo::AssociatedFunctionType func_type, + FunctionLibraryDefinition* fld) { + // For SymbolicGradient, `func_name` is always "SymbolicGradient" which + // is not very informative. Use node name instead. + return ( + func_type == + AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient + ? fld->UniqueFunctionName(absl::StrCat(n->name(), "_f15n_")) + : fld->UniqueFunctionName(absl::StrCat(func_name, "_f15n_"))); +} + +// Returns name to which a modified function has been mapped. +const string& GetMappedFunctionName(FuncMapIter func_iter) { + DCHECK(func_iter->second.has_value()); + return func_iter->second.value(); +} + +// Updates `func_map` with function given by `canonicalized_name`. +void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, + const string& new_func_name, bool function_modified) { + // If function was modified store its new name, otherwise add empty entry to + // record that function has been processed and does not need to be rewritten. + (*func_map)[canonicalized_name] = + function_modified ? absl::make_optional(new_func_name) : absl::nullopt; +} + +// Adds new function def to graph's function library if necessary. +Status AddFunctionDefToGraphLibrary( + const string& func_name, const AssociatedFunctionInfo& associated_function, + Graph* graph, FunctionLibraryDefinition* fld) { + const OpRegistrationData* op_reg_data; + // We have to be careful with adding the function def since there are three + // different `OpRegistryInterface`s involved here: + // `fld`, `graph->flib_def()` and `graph->flib_def().default_registry()`. + // We have already added the function def to `fld` before calling this + // function but for the subsequent `RewriteAssociatedFunction` call we need + // the function def to be in one of the other two registries, otherwise + // `RewriteAssociatedFunction` will fail for the `kFunctionCallNode` case + // because it cannot find the associated function def. + // On the other hand, we should not add the function def if it is already + // contained in one of the last two registries, this would lead to errors when + // the function def is already in one registry and we try to add it to the + // other one (if we try to add it to the same it's fine). This can happen in + // cases where one of the last two registries is identical to `fld` (which we + // already updated). + // Therefore, before adding the function def we have to check if it's already + // contained in either `graph->flib_def()` or + // `graph->flib_def().default_registry()` which is done in the following line + // (we have to use `LookUp` instead of `Contains` or `Find` because the latter + // both don't check the default registry). + if (graph->flib_def().LookUp(func_name, &op_reg_data).ok()) + return Status::OK(); + + const FunctionDef* new_fdef = fld->Find(func_name); + DCHECK(new_fdef != nullptr); + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = *new_fdef; + return graph->AddFunctionLibrary(fdef_lib); +} + +// Functionalizes function given by `func_name`. Update `func_map` accordingly. 
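// The refactor above threads a FuncMap (canonicalized function name ->
// optional new name) through the pass: an entry holding a value means the
// function was functionalized and rewritten under the new name, while an
// entry holding nullopt means it was already visited and needs no rewrite.
// A standalone sketch of that memoization pattern with std::optional
// (absl::optional in the TensorFlow code) and hypothetical names:
#include <cassert>
#include <map>
#include <optional>
#include <string>

using FuncMap = std::map<std::string, std::optional<std::string>>;

// Records the outcome of processing `name`: store the rewritten name if the
// body changed, otherwise store nullopt so later callers can skip it.
void RecordProcessed(FuncMap* func_map, const std::string& name,
                     const std::string& new_name, bool modified) {
  (*func_map)[name] = modified ? std::make_optional(new_name) : std::nullopt;
}

int main() {
  FuncMap func_map;
  RecordProcessed(&func_map, "cond_true", "cond_true_f15n_0",
                  /*modified=*/true);
  RecordProcessed(&func_map, "identity_fn", "", /*modified=*/false);

  auto it = func_map.find("cond_true");
  assert(it != func_map.end() && it->second.has_value());   // rewritten
  assert(it->second.value() == "cond_true_f15n_0");

  it = func_map.find("identity_fn");
  assert(it != func_map.end() && !it->second.has_value());  // visited, unchanged
  assert(func_map.find("unseen_fn") == func_map.end());     // not processed yet
  return 0;
}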
+Status FunctionalizeControlFlowForFunction( + const string& func_name, const string& new_func_name, + const protobuf::Map& attrs, + FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, + FuncMap* func_map, bool* function_modified, + const NodeFilter& node_filter = {}); + +// Functionalizes all functions that are (directly or indirectly) associated to +// any node in `graph`. Adds processed functions to `func_map`. +Status FunctionalizeControlFlowForNodeAssociatedFunctions( + FuncMap* func_map, Graph* graph, FunctionLibraryDefinition* fld, + FunctionLibraryRuntime* flr, bool* any_function_modified, + const NodeFilter& node_filter) { + std::vector>> + nodes_to_associated_functions; + for (auto* n : graph->nodes()) { + auto associated_functions = GetAssociatedFunctions(*n, fld); + if (!associated_functions.empty()) { + nodes_to_associated_functions.push_back({n, associated_functions}); + } + } + for (const auto& pair : nodes_to_associated_functions) { + Node* n = pair.first; + auto associated_functions = pair.second; + for (auto& associated_function : associated_functions) { + // Note that if `n` is a function call node, then potential calls of + // `RewriteAssociatedFunction` below might delete `n` and create a new + // node instead, making `n` an invalid pointer. That's fine because in + // that case `n` only has one associated function, so this loop has only + // one iteration and we don't use `n` again after the rewrite. + // The invariant is guaranteed by `GetAssociatedFunctions` and confirmed + // below. + DCHECK(associated_function.type() != + AssociatedFunctionInfo::kFunctionCallNode || + associated_functions.size() == 1); + + // Process one node-function-pair. + string func_name = associated_function.func_name(); + string canonicalized_name = + Canonicalize(func_name, AttrSlice(&associated_function.attrs())); + auto func_iter = func_map->find(canonicalized_name); + string new_func_name; + if (FunctionHasBeenProcessed(func_iter, func_map)) { + if (FunctionHasBeenModified(func_iter)) { + *any_function_modified = true; + new_func_name = GetMappedFunctionName(func_iter); + TF_RETURN_IF_ERROR(RewriteAssociatedFunction( + graph, n, fld, associated_function, new_func_name)); + } + continue; + } + // Function is processed for the first time. + bool function_modified = false; + new_func_name = + GetNewFunctionName(func_name, n, associated_function.type(), fld); + // Perform functionalization for current function. + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( + func_name, new_func_name, associated_function.attrs(), fld, flr, + func_map, &function_modified, node_filter)); + UpdateFunctionMap(func_map, canonicalized_name, new_func_name, + function_modified); + if (function_modified) { + *any_function_modified = true; + TF_RETURN_IF_ERROR(AddFunctionDefToGraphLibrary( + new_func_name, associated_function, graph, fld)); + TF_RETURN_IF_ERROR(RewriteAssociatedFunction( + graph, n, fld, associated_function, new_func_name)); + } + } + } + return Status::OK(); +} + +Status FunctionalizeControlFlowForFunction( + const string& func_name, const string& new_func_name, + const protobuf::Map& attrs, + FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, + FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter) { + *function_modified = false; + + // Convert the function to a graph. 
+ FunctionLibraryRuntime::Handle handle; + TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle)); + Status ret_status = Status::OK(); + auto cleanup_handle = gtl::MakeCleanup([&]() { + auto s = flr->ReleaseHandle(handle); + if (!s.ok()) { + ret_status.Update(s); + } + }); + const FunctionBody* body = flr->GetFunctionBody(handle); + Graph* g = body->graph; + + // Check if the graph has Switch or Merge node. + bool has_switch_or_merge = false; + for (Node* n : body->graph->nodes()) { + // Skip nodes that are filtered out. + if (node_filter && !node_filter(n)) continue; + if (n->type_string() == "Switch" || n->type_string() == "Merge") { + has_switch_or_merge = true; + break; + } + } + // Before functionalizing control flow in `g` we functionalize control flow + // in functions (directly or indirectly) associated with nodes in `g`. + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForNodeAssociatedFunctions( + func_map, g, fld, flr, function_modified, node_filter)); + + if (has_switch_or_merge) { + *function_modified = true; + + // Functionalize the function body. + if (VLOG_IS_ON(4)) { + DumpGraphToFile( + absl::StrCat("functionalize_control_flow_before_fdef_", func_name), + *g, fld); + } + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld, node_filter)); + if (VLOG_IS_ON(4)) { + DumpGraphToFile( + absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g, + fld); + } + } + if (*function_modified) { + // Add rewritten FunctionDef into library. + FunctionDef functionalized_fdef; + TF_RETURN_IF_ERROR( + GraphToFunctionDef(*g, new_func_name, &functionalized_fdef)); + if (func_name == new_func_name) { + VLOG(2) << "Replacing function " << func_name; + TF_RETURN_IF_ERROR( + fld->ReplaceFunction(new_func_name, functionalized_fdef)); + } else { + VLOG(2) << "Adding function " << new_func_name; + TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef)); + } + } + + return ret_status; +} + Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library, - const NodeFilter& node_filter) { + const NodeFilter& node_filter, + bool include_functions) { VLOG(2) << "FunctionalizeControlFlow (initial): " << DumpGraphToFile("functionalize_initial", *graph, library); + if (include_functions) { + // Functionalize control flow in functions that are (directly or indirectly) + // associated with a node in `graph`. + auto pflr = absl::make_unique( + /*device_mgr=*/nullptr, tensorflow::Env::Default(), + /*config=*/nullptr, TF_GRAPH_DEF_VERSION, library, + tensorflow::OptimizerOptions()); + // `pflr` has only one `FunctionLibraryRuntime`, for `kDefaultFLRDevice` + // (because we constructed it with `device_mgr = nullptr`). + FunctionLibraryRuntime* flr = + pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); + + FuncMap func_map; + bool modified = false; + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForNodeAssociatedFunctions( + &func_map, graph, library, flr, &modified, node_filter)); + } // Functionalize and remove while loops from graph. 
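// FunctionalizeControlFlowForFunction above instantiates a runtime handle and
// registers a cleanup object so the handle is released on every return path
// (the TensorFlow code additionally folds the release status into the status
// it returns). A standalone sketch of that scope-guard idiom with a
// simplified Status type and a hypothetical resource:
#include <cassert>
#include <functional>
#include <string>
#include <utility>

struct Status {
  std::string error;  // empty means OK
  bool ok() const { return error.empty(); }
};

class Cleanup {  // minimal stand-in for gtl::MakeCleanup
 public:
  explicit Cleanup(std::function<void()> f) : f_(std::move(f)) {}
  ~Cleanup() { if (f_) f_(); }
  Cleanup(const Cleanup&) = delete;
  Cleanup& operator=(const Cleanup&) = delete;

 private:
  std::function<void()> f_;
};

int live_handles = 0;  // tracks the hypothetical resource

Status ProcessFunction(bool fail_early) {
  ++live_handles;                           // "Instantiate" the handle.
  Cleanup release([] { --live_handles; });  // "ReleaseHandle" on every path.
  if (fail_early) return Status{"functionalization failed"};
  return Status{};
}

int main() {
  assert(ProcessFunction(/*fail_early=*/false).ok());
  assert(live_handles == 0);  // released on the normal path
  assert(!ProcessFunction(/*fail_early=*/true).ok());
  assert(live_handles == 0);  // released on the early return too
  return 0;
}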
TF_RETURN_IF_ERROR(FunctionalizeWhileLoop(graph, library, node_filter)); @@ -68,153 +310,19 @@ Status FunctionalizeControlFlow(Graph* graph, Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, FunctionLibraryDefinition* library, - const NodeFilter& node_filter) { + const NodeFilter& node_filter, + bool include_functions) { FunctionDefLibrary function_lib = graph_def->library(); Graph graph(OpRegistry::Global()); TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph)); - TF_RETURN_IF_ERROR(FunctionalizeControlFlow(&graph, library, node_filter)); + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(&graph, library, node_filter, + include_functions)); graph.ToGraphDef(graph_def); std::swap(*graph_def->mutable_library(), function_lib); return Status::OK(); } -Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, - FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, - std::map>* canonicalized_name_to_new_name, - bool* modified) { - *modified = false; - - // Convert the function to Graph. - FunctionLibraryRuntime::Handle handle; - TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle)); - Status ret_status = Status::OK(); - auto cleanup_handle = gtl::MakeCleanup([&]() { - auto s = flr->ReleaseHandle(handle); - if (!s.ok()) { - ret_status.Update(s); - } - }); - const FunctionBody* body = flr->GetFunctionBody(handle); - Graph* g = body->graph; - - // Check if the graph has Switch or Merge node. - bool has_switch_or_merge = false; - for (Node* n : body->graph->nodes()) { - if (n->type_string() == "Switch" || n->type_string() == "Merge") { - has_switch_or_merge = true; - break; - } - } - // We cannot return here directly if the graph has no Switch/Merge. - // It might contain function call nodes, or If/While nodes with Switch/Merge - // in function body. We still need to rewrite those functions and modify - // corresponding nodes. - - // If any node has associated functions, functionalize them first. - // Gather nodes with associated functions first, because rewriting those nodes - // might involve node deletion/addition. Avoid modifying nodes while iterating - // it. - std::vector>> - nodes_to_associated_functions; - for (auto* n : g->nodes()) { - auto associated_functions = GetAssociatedFunctions(*n, fld); - if (!associated_functions.empty()) { - nodes_to_associated_functions.push_back({n, associated_functions}); - } - } - for (const auto& iter : nodes_to_associated_functions) { - Node* n = iter.first; - auto associated_functions = iter.second; - for (auto& associated_function : associated_functions) { - string name = associated_function.func_name(); - string canonicalized_name = - Canonicalize(name, AttrSlice(&associated_function.attrs())); - auto iter = canonicalized_name_to_new_name->find(canonicalized_name); - string new_name; - bool function_modified; - if (iter != canonicalized_name_to_new_name->end()) { - // If we already processed this function, check if it was rewritten. If - // the function was rewritten, the entry will be non-empty. Otherwise - // the entry will be empty. - function_modified = iter->second.has_value(); - if (function_modified) { - new_name = iter->second.value(); - } - } else { - if (associated_function.type() == - AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) { - // For SymbolicGradient, `name` is always "SymbolicGradient", - // which is not very informative. Use node name instead. 
- new_name = fld->UniqueFunctionName(absl::StrCat(n->name(), "_f15n_")); - } else { - new_name = fld->UniqueFunctionName(absl::StrCat(name, "_f15n_")); - } - TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( - name, new_name, associated_function.attrs(), fld, flr, - canonicalized_name_to_new_name, &function_modified)); - if (function_modified) { - // If the function was rewritten, add an non-empty entry. So later we - // know we have processed this function, and it was rewritten into - // another function. - (*canonicalized_name_to_new_name)[canonicalized_name] = new_name; - } else { - // If the function was not rewritten, add an empty entry. So later - // we know we have processed this function, and it does not need to be - // rewritten. - (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt; - } - } - if (function_modified) { - *modified = true; - - // Notice that if "n" is a function call, RewriteAssociatedFunction() - // will delete it and create a new node instead, making "n" an invalid - // pointer. That's fine because in that case, associated_functions will - // only have one member and the loop will only run once. - TF_RETURN_IF_ERROR(RewriteAssociatedFunction( - g, n, fld, associated_function, new_name)); - } - } - } - - if (has_switch_or_merge) { - *modified = true; - - // Functionalize the function body. - if (VLOG_IS_ON(4)) { - DumpGraphToFile( - absl::StrCat("functionalize_control_flow_before_fdef_", func_name), - *g, fld); - } - TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld)); - if (VLOG_IS_ON(4)) { - DumpGraphToFile( - absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g, - fld); - } - } - - if (*modified) { - // Add rewritten FunctionDef into library. - FunctionDef functionalized_fdef; - TF_RETURN_IF_ERROR( - GraphToFunctionDef(*g, new_func_name, &functionalized_fdef)); - if (func_name == new_func_name) { - VLOG(2) << "Replacing function " << func_name; - TF_RETURN_IF_ERROR( - fld->ReplaceFunction(new_func_name, functionalized_fdef)); - } else { - VLOG(2) << "Adding function " << new_func_name; - TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef)); - } - } - - return ret_status; -} - Status FunctionalizeControlFlowForXlaPass::Run( const GraphOptimizationPassOptions& options) { Graph* graph = options.graph->get(); @@ -241,7 +349,7 @@ Status FunctionalizeControlFlowForXlaPass::Run( // XlaLaunch ops are generated by EncapsulateXlaComputationsPass. {"XlaLaunch", "function"}, }; - std::map> canonicalized_name_to_new_name; + FuncMap func_map; bool fld_modified = false; for (Node* n : graph->nodes()) { auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string()); @@ -258,7 +366,7 @@ Status FunctionalizeControlFlowForXlaPass::Run( bool modified; TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( func.name(), new_func_name, func.attr(), options.flib_def, flr, - &canonicalized_name_to_new_name, &modified)); + &func_map, &modified)); if (modified) { n->ClearAttr(func_attr); func.set_name(new_func_name); diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index f9e751e2d67..46abae27878 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -30,6 +30,13 @@ namespace tensorflow { // // If `node_filter` is defined, then only loops and conditions for whose // nodes `node_filter` returns true are functionalized. 
+ +// If `include_functions` is true, then loops and conditions inside of functions +// that are associated with nodes in `graph` (e.g., a function called from a +// node in `graph`) are also functionalized, otherwise they are not. +// This also handles transitive cases, e.g., a function body will be +// functionalized when it is called in another function that is called by some +// node in `graph` (and so on). The node filter also applies here. // // Precondition: // For any node in a loop or condition for which `node_filter` returns true, @@ -43,11 +50,13 @@ namespace tensorflow { // satisfies the above conditions. Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library, - const NodeFilter& node_filter = {}); + const NodeFilter& node_filter = {}, + bool include_functions = false); Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, FunctionLibraryDefinition* library, - const NodeFilter& node_filter = {}); + const NodeFilter& node_filter = {}, + bool include_functions = false); // This pass looks at the graph, and turns V1 control flow structure // (Switch/Merge/etc.) into V2 control flow structure (If/While). diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 79a042ad680..951ebdd7ec1 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -27,12 +27,15 @@ limitations under the License. #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/dump_graph.h" #include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { @@ -63,18 +66,41 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name, // math_ops.less(y, x), lambda: math_ops.multiply(y, 17), // lambda: math_ops.add(x, 23)) // -// Tests different node filters. -class ConditionalTestFixture : public ::testing::TestWithParam { +// Tests different node filters and functionalization inside of a function. 
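A minimal sketch of how a caller might opt in to the new flag, using only the declarations above; the wrapper name `FunctionalizeEverything` and its placement are illustrative, not part of the patch:

#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"

namespace tensorflow {

// Functionalize V1 control flow in the graph and, via `include_functions`,
// in every function reachable from the graph's nodes. The empty node filter
// means no node is excluded.
Status FunctionalizeEverything(GraphDef* graph_def,
                               FunctionLibraryDefinition* flib) {
  return FunctionalizeControlFlowForGraphDef(graph_def, flib,
                                             /*node_filter=*/{},
                                             /*include_functions=*/true);
}

}  // namespace tensorflow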
+class ConditionalTestFixture + : public ::testing::TestWithParam> { protected: - void SetUp() override { restrict_to_tpu_nodes_ = GetParam(); } + void SetUp() override { + restrict_to_tpu_nodes_ = std::get<0>(GetParam()); + wrap_condition_in_function_ = std::get<1>(GetParam()); + } void RunTest(); private: + void BuildCondGraph(Graph* cond_graph); + void CheckGraphDef(const GraphDef& graph_def, + const FunctionLibraryDefinition& library); + bool restrict_to_tpu_nodes_ = false; + bool wrap_condition_in_function_ = false; }; -void ConditionalTestFixture::RunTest() { - Graph graph(OpRegistry::Global()); +TEST_P(ConditionalTestFixture, ConditionalTests) { RunTest(); } + +INSTANTIATE_TEST_SUITE_P( + FunctionalizeControlFlow, ConditionalTestFixture, + ::testing::Combine(::testing::Bool(), ::testing::Bool()), + [](const ::testing::TestParamInfo& + info) { + bool restrict_to_tpu_nodes = std::get<0>(info.param); + bool wrap_cond_in_function = std::get<1>(info.param); + string name = + absl::StrCat(restrict_to_tpu_nodes ? "with_filter" : "without_filter", + wrap_cond_in_function ? "_in_function" : "_in_graph"); + return name; + }); + +void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { { Scope scope = Scope::NewRootScope().ExitOnError(); @@ -102,13 +128,117 @@ void ConditionalTestFixture::RunTest() { auto merge = ops::Merge(scope.WithOpName("cond/Merge"), std::initializer_list{add, mul}); - TF_EXPECT_OK(scope.ToGraph(&graph)); + TF_EXPECT_OK(scope.ToGraph(cond_graph)); // Set `_tpu_replicate` attribute for all nodes. - for (Node* n : graph.nodes()) { + for (Node* n : cond_graph->nodes()) { n->AddAttr("_tpu_replicate", "cluster"); } } +} + +void ConditionalTestFixture::CheckGraphDef( + const GraphDef& graph_def, const FunctionLibraryDefinition& library) { + string op_name; + NameAttrList then_fn; + NameAttrList else_fn; + TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); + InstantiationResultForTest else_result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); + auto if_op = + ops::If(scope.WithOpName(op_name), less, + std::initializer_list{less, y, x}, {DT_INT32}, then_fn, + else_fn, ops::If::OutputShapes({PartialTensorShape()})); + auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } + + // then body. 
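The fixture above combines two independent boolean parameters and derives readable test names from the resulting tuple. The same googletest pattern in isolation (type and test names here are illustrative only):

#include <string>
#include <tuple>

#include "gtest/gtest.h"

class TwoFlagTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {};

TEST_P(TwoFlagTest, Runs) {
  bool flag_a = std::get<0>(GetParam());
  bool flag_b = std::get<1>(GetParam());
  // Placeholder body; a real test would branch on the two flags.
  EXPECT_TRUE(flag_a || !flag_a);
  (void)flag_b;
}

INSTANTIATE_TEST_SUITE_P(
    AllCombinations, TwoFlagTest,
    ::testing::Combine(::testing::Bool(), ::testing::Bool()),
    [](const ::testing::TestParamInfo<TwoFlagTest::ParamType>& info) {
      // Build a valid identifier from the two booleans, as the patch does.
      return std::string(std::get<0>(info.param) ? "a_on" : "a_off") +
             (std::get<1>(info.param) ? "_b_on" : "_b_off");
    });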
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); + auto cond = ops::Const( + scope.WithOpName("cond").WithControlDependencies(identity), 17); + auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); + auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), mul, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } + + // else body. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); + auto cond_1 = ops::Const( + scope.WithOpName("cond_1").WithControlDependencies(identity), 23); + auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); + auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } +} + +void ConditionalTestFixture::RunTest() { + Graph graph(OpRegistry::Global()); + if (wrap_condition_in_function_) { + // Wrap condition in a function which is called from `graph`. + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + + Graph cond_graph(OpRegistry::Global()); + BuildCondGraph(&cond_graph); + + FunctionDef cond_fdef; + TF_ASSERT_OK(GraphToFunctionDef(cond_graph, "cond_fn", &cond_fdef)); + + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = cond_fdef; + TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(fdef_lib)); + NodeDef cond_fn; + cond_fn.set_name("cond_node"); + cond_fn.set_op("cond_fn"); + *(cond_fn.add_input()) = "source"; + Status status; + scope.graph()->AddNode(cond_fn, &status); + TF_ASSERT_OK(status); + TF_ASSERT_OK(scope.ToGraph(&graph)); + } else { + // Build condition in `graph`. + BuildCondGraph(&graph); + } + FunctionLibraryDefinition library(graph.flib_def()); // If `restrict_to_tpu_nodes_` is true let filter function return true for // `_tpu_replicate` nodes. NodeFilter node_filter = @@ -116,99 +246,47 @@ void ConditionalTestFixture::RunTest() { ? 
[](const Node* n) { return n->attrs().Find("_tpu_replicate"); } : NodeFilter{}; - FunctionLibraryDefinition library(OpRegistry::Global(), {}); GraphDef optimized_graph_def; graph.ToGraphDef(&optimized_graph_def); - TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef(&optimized_graph_def, - &library, node_filter)); - TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library, node_filter)); - GraphDef converted_graph_def; - graph.ToGraphDef(&converted_graph_def); + TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef( + &optimized_graph_def, &library, node_filter, + /*include_functions=*/wrap_condition_in_function_)); + TF_ASSERT_OK(FunctionalizeControlFlow( + &graph, &library, node_filter, + /*include_functions=*/wrap_condition_in_function_)); - for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { - string op_name; - NameAttrList then_fn; - NameAttrList else_fn; - TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); - InstantiationResultForTest else_result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + if (wrap_condition_in_function_) { + // Check if function body was functionalized. + auto pflr = absl::make_unique( + /*device_mgr=*/nullptr, tensorflow::Env::Default(), + /*config=*/nullptr, TF_GRAPH_DEF_VERSION, &library, + tensorflow::OptimizerOptions()); + FunctionLibraryRuntime* flr = + pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); + FunctionLibraryRuntime::Handle handle; - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); - auto if_op = - ops::If(scope.WithOpName(op_name), less, - std::initializer_list{less, y, x}, {DT_INT32}, then_fn, - else_fn, ops::If::OutputShapes({PartialTensorShape()})); - auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // then body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); - auto cond = ops::Const( - scope.WithOpName("cond").WithControlDependencies(identity), 17); - auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); - auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), mul, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(then_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), - result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // else body. 
- { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); - auto cond_1 = ops::Const( - scope.WithOpName("cond_1").WithControlDependencies(identity), 23); - auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); - auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), - result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Functionalized function name is the type string of `cond_node`. + string func_name; + for (Node* n : graph.nodes()) { + if (n->name() == "cond_node") { + func_name = n->type_string(); + break; + } } + TF_ASSERT_OK(flr->Instantiate(func_name, AttrSlice(), &handle)); + const FunctionBody* body = flr->GetFunctionBody(handle); + GraphDef graph_def; + body->graph->ToGraphDef(&graph_def); + CheckGraphDef(graph_def, library); + } else { + // Check if graphs were functionalized. + CheckGraphDef(optimized_graph_def, library); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + CheckGraphDef(converted_graph_def, library); } } -TEST_P(ConditionalTestFixture, ConditionalTests) { RunTest(); } - -INSTANTIATE_TEST_SUITE_P( - FunctionalizeControlFlow, ConditionalTestFixture, ::testing::Bool(), - [](const ::testing::TestParamInfo& - info) { return info.param ? "with_filter" : "without_filter"; }); - // Returns the names of the "cond" and "body" functions for the While node // in a graph. Status FindWhileCondAndBody(const GraphDef& graph, NameAttrList* cond, diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index dce5efe5557..79412c4abc8 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -24,11 +24,11 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc index 1a26f974989..02f178f9acf 100644 --- a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc +++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/framework/device_attributes.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/tensor.h" @@ -139,5 +140,11 @@ TEST(FusedBatchnormReserveSpaceTest, Test) { test::ExpectClose(results[0], results[1], /*atol=*/1e-4); test::ExpectClose(results[2], results[3], /*atol=*/1e-4); } + +static bool Initialized = [] { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + return true; +}(); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 26051c98cb7..0edd918a92d 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -108,6 +108,7 @@ tf_kernel_library( "stack_ops.cc", "stateful_random_ops.cc", "stateless_random_ops.cc", + "stateless_random_ops_v2.cc", "strided_slice_op.cc", "tensor_array_ops.cc", "tensor_list_ops.cc", @@ -187,6 +188,7 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:stateful_random_ops_header", + "//tensorflow/core/kernels:stateless_random_ops_v2_header", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc index d7a8e67dd33..807c061b60f 100644 --- a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -28,13 +29,26 @@ class BroadcastToOp : public XlaOpKernel { : XlaOpKernel(context) {} void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape(0); TensorShape output_shape; OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape)); + auto output_status_or = + BroadcastTo(context->Input(0), output_shape.dim_sizes()); + OP_REQUIRES_OK(context, output_status_or.status()); + auto output = output_status_or.ValueOrDie(); + std::vector dynamic_dims; + OP_REQUIRES_OK( + context, context->ResolveInputDynamismIntoPredVector(1, &dynamic_dims)); + for (int64 dim = 0; dim < dynamic_dims.size(); ++dim) { + if (dynamic_dims[dim]) { + output = xla::SetDimensionSize( + output, + xla::Reshape(xla::Slice(context->Input(1), {dim}, {dim + 1}, {1}), + {}), + dim); + } + } - auto output = BroadcastTo(context->Input(0), output_shape.dim_sizes()); - OP_REQUIRES_OK(context, output.status()); - context->SetOutput(0, output.ValueOrDie()); + context->SetOutput(0, output); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc index 7ac38369eb4..ad94c1383f8 100644 --- a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc @@ -63,36 +63,27 @@ class DequantizeOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { DataType input_type = ctx->input_type(0); - double minrange, maxrange; - - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(1, &minrange)); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(2, &maxrange)); - - float min_range = static_cast(minrange); - float max_range = static_cast(maxrange); - float full_range, half_range; + xla::XlaOp input = ctx->Input(0); + xla::XlaOp output = xla::ConvertElementType(input, xla::F32); + xla::XlaOp min_range = xla::ConvertElementType(ctx->Input(1), xla::F32); + xla::XlaOp max_range = xla::ConvertElementType(ctx->Input(2), xla::F32); + xla::XlaOp full_range; + xla::XlaOp half_range; if (input_type == DT_QINT8) { - full_range = get_fullrange(); - half_range = (full_range + 1.0f) / 2.0f; + full_range = ScalarLike(output, get_fullrange()); + half_range = + (full_range + ScalarLike(output, 1.0f)) / ScalarLike(output, 2.0f); } else { OP_REQUIRES(ctx, input_type == DT_QUINT8, errors::InvalidArgument( "Only support DT_QINT8 or DT_QUINT8, got ", input_type)); - full_range = get_fullrange(); - half_range = 0.0f; + full_range = ScalarLike(output, get_fullrange()); + half_range = ScalarLike(output, 0.0f); } - float scale_factor = (max_range - min_range) / full_range; + xla::XlaOp scale = (max_range - min_range) / full_range; - xla::XlaOp input = ctx->Input(0); - xla::XlaOp output; - - output = xla::ConvertElementType(input, xla::F32); - - auto scale = ScalarLike(output, scale_factor); - auto halfrange = ScalarLike(output, half_range); - output = xla::Add(xla::Mul(xla::Add(output, halfrange), scale), - ScalarLike(output, min_range)); + output = xla::Add(xla::Mul(xla::Add(output, half_range), scale), min_range); if (dtype_ == DT_BFLOAT16) { output = xla::ConvertElementType(output, xla::BF16); diff --git a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc 
b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc index 19aa85f9d42..b4b18dd2b36 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc @@ -49,7 +49,8 @@ class GatherOp : public XlaOpKernel { bool indices_are_sorted_; }; -REGISTER_XLA_OP(Name("XlaGather"), GatherOp); +REGISTER_XLA_OP(Name("XlaGather").CompileTimeConstantInput("slice_sizes"), + GatherOp); class ScatterOp : public XlaOpKernel { public: diff --git a/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc b/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc index 46585a26769..71920372cde 100644 --- a/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc @@ -30,7 +30,8 @@ class XlaReplicaIdOp : public XlaOpKernel { }; void XlaReplicaIdOp::Compile(XlaOpKernelContext* ctx) { - ctx->SetOutput(0, xla::ReplicaId(ctx->builder())); + ctx->SetOutput( + 0, xla::ConvertElementType(xla::ReplicaId(ctx->builder()), xla::S32)); } REGISTER_XLA_OP(Name("XlaReplicaId"), XlaReplicaIdOp); diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index bf9a9150ea6..213045e428a 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -19,8 +19,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -108,32 +110,73 @@ class ReshapeOp : public XlaOpKernel { VLOG(2) << "Reshape from " << input_shape.DebugString() << " to " << shape.DebugString() << ", unknown_index=" << unknown_index; + auto input_xla_shape = ctx->InputXlaShape(0); + if (input_xla_shape->is_static()) { + ctx->SetOutput(0, xla::Reshape(ctx->Input(0), shape.dim_sizes())); + return; + } + // Handing dynamic reshapes if input contains a dynamic dimension. + std::vector output_dim_sizes; + std::vector dims_are_dynamic; + for (int64 i = 0; i < shape.dims(); ++i) { + output_dim_sizes.push_back( + xla::Reshape(xla::Slice(ctx->Input(1), {i}, {i + 1}, {1}), {})); + } + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(1, &dims_are_dynamic)); + if (unknown_index == -1) { + // No unknown index. + ctx->SetOutput(0, + xla::DynamicReshape(ctx->Input(0), output_dim_sizes, + shape.dim_sizes(), dims_are_dynamic)); + return; + } + auto common_factors = + xla::CommonFactors(input_shape.dim_sizes(), shape.dim_sizes()); - shape_input.clear(); - // Run get input again, this time with dynamic dimension represented as - // "-1" - ctx->set_dynamic_dimension_is_minus_one(true); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &shape_input)); - - int dynamic_dimension = -1; - - for (int d = 0; d < num_dims; ++d) { - const int32 size = shape_input[d]; - if (size == -1) { - if (dynamic_dimension == -1) { - dynamic_dimension = d; + // Find common_factors that the input belongs to. 
+ for (int64 i = 0; i < common_factors.size() - 1; ++i) { + auto start = common_factors[i]; + auto end = common_factors[i + 1]; + bool input_is_dynamic = false; + // product of all input dims in this group. E.g., in + // reshape(Tensor([2, 3, 3]), [3, -1, 3]) product of the group + // containing -1 will be 6. + xla::XlaOp product = xla::One(ctx->builder(), xla::S32); + for (int64 dim = start.first; dim < end.first; ++dim) { + if (input_xla_shape->is_dynamic_dimension(dim)) { + input_is_dynamic = true; + } + product = xla::Mul(product, xla::GetDimensionSize(ctx->Input(0), dim)); + } + bool unknown_dim_in_group = false; + // The real size for the -1 dimension in a reshape. E.g., in + // reshape(Tensor([2, 3, 3]), [3, -1, 3]) this will be 2. + xla::XlaOp unknown_dim_size = product; + for (int64 dim = start.second; dim < end.second; ++dim) { + if (dim == unknown_index) { + unknown_dim_in_group = true; + } else { - if (unknown_index != d) { - dynamic_dimension = d; - } + unknown_dim_size = xla::Div(unknown_dim_size, output_dim_sizes[dim]); + } + } - } - // Pass unknown_index to Xla::Reshape as a hint for dynamic shape inference - // in XLA to know which output dimension is dynamic. - ctx->SetOutput(0, xla::ReshapeWithInferredDimension( - ctx->Input(0), shape.dim_sizes(), dynamic_dimension)); + if (unknown_dim_in_group) { + // If input dim is dynamic, output dim at the -1 position must be + // dynamic. Similarly, if input dim is static, output dim has to be + // static at the -1 dimension. + dims_are_dynamic[unknown_index] = input_is_dynamic; + output_dim_sizes[unknown_index] = unknown_dim_size; + + ctx->SetOutput( + 0, xla::DynamicReshape(ctx->Input(0), output_dim_sizes, + shape.dim_sizes(), dims_are_dynamic)); + VLOG(2) << "Reshape from " << ctx->InputXlaShape(0)->ToString() + << " to " << xla::VectorString(shape.dim_sizes()) + << ", dynamic_dims=" << xla::VectorString(dims_are_dynamic); + return; + } + } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc index 97359f81eee..d63b8146491 100644 --- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -74,12 +74,44 @@ class UnsortedSegmentReduce : public XlaOpKernel { " vs. ", indices_shape.dim_size(d))); } xla::XlaBuilder* builder = ctx->builder(); + // data shape = [indices_shape, segment_shape] + // buffer shape = [num_segment, segment_shape] + // We now create the buffer shape by reverse engineering data shape into + // indices shape and segment shape. TensorShape buffer_shape = data_shape; buffer_shape.RemoveDimRange(0, indices_shape.dims()); buffer_shape.InsertDim(0, num_segments); + auto buffer = xla::Broadcast(InitialValue(builder), buffer_shape.dim_sizes()); + // Build dynamic dim sizes for buffer, as well as whether each dimension + // size is dynamic or static. We build two parts: num_segment part and + // segment_shape part. + std::vector<xla::XlaOp> buffer_dims; + std::vector<bool> buffer_dims_are_dynamic; + // Build the "num_segment" part. + bool num_segments_is_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPred(2, &num_segments_is_dynamic)); + + buffer_dims.insert(buffer_dims.begin(), ctx->Input(2)); + buffer_dims_are_dynamic.insert(buffer_dims_are_dynamic.begin(), + num_segments_is_dynamic); + // Build the segment shape part.
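The comments in the Reshape change above work through reshape(Tensor([2, 3, 3]), [3, -1, 3]): xla::CommonFactors pairs input dims {2, 3} with output dims {3, -1}, so the group's element count is 6 and the inferred size of the -1 dimension is 6 / 3 = 2. The same per-group arithmetic as a small host-side sketch (the kernel performs it on XlaOps so it also covers dynamic sizes):

#include <cstdint>
#include <iostream>
#include <vector>

// Given the input dims of one common-factor group and the known output dims of
// that group, the -1 dimension is the quotient of the two products.
int64_t InferUnknownDimInGroup(
    const std::vector<int64_t>& group_in_dims,
    const std::vector<int64_t>& group_out_known_dims) {
  int64_t product = 1;
  for (int64_t d : group_in_dims) product *= d;           // group element count
  for (int64_t d : group_out_known_dims) product /= d;    // divide out known dims
  return product;
}

int main() {
  // reshape(Tensor([2, 3, 3]), [3, -1, 3]): the group containing -1 pairs
  // input dims {2, 3} with output dims {3, -1}.
  std::cout << InferUnknownDimInGroup({2, 3}, {3}) << "\n";  // prints 2
  return 0;
}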
+ for (int64 i = indices_shape.dims(); i < data_shape.dims(); ++i) { + buffer_dims.push_back(xla::GetDimensionSize(data, i)); + buffer_dims_are_dynamic.push_back( + ctx->InputXlaShape(0)->is_dynamic_dimension(i)); + } + + for (int64 i = 0; i < buffer_dims.size(); ++i) { + if (buffer_dims_are_dynamic[i]) { + // For each dynamic dimension, call set-dimension-size on it. + buffer = xla::SetDimensionSize(buffer, buffer_dims[i], i); + } + } + auto combiner = [this](xla::XlaOp a, xla::XlaOp b, xla::XlaBuilder* builder) { return Combine(a, b); }; diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 85917af6a65..75faa2eac81 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -15,6 +15,7 @@ limitations under the License. // XLA-specific Shape Ops. +#include "absl/strings/str_format.h" #include "tensorflow/compiler/tf2xla/kernels/shape_util.h" #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -24,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -65,6 +67,47 @@ class ShapeOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Shape").CompilationOnly().IsMetadataOp(), ShapeOp); +class XlaSetBoundOp : public XlaOpKernel { + public: + explicit XlaSetBoundOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape bound_shape = ctx->InputShape("bound"); + + OP_REQUIRES( + ctx, + ctx->InputType("bound") == DT_INT32 && + ctx->InputType("input") == DT_INT32, + errors::InvalidArgument( + "XlaSetBound can only set bound for int32 scalar value: got", + input_shape.DebugString())); + + OP_REQUIRES( + ctx, input_shape.dims() == 0, + errors::InvalidArgument("XlaSetBound should only be used to set a " + "bound to the an int32 scalar value: got", + input_shape.DebugString())); + + OP_REQUIRES( + ctx, bound_shape.dims() == 0, + errors::InvalidArgument("XlaSetBound should only be used to set a " + "bound to the an int32 scalar value: got", + bound_shape.DebugString())); + int64 bound; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("bound", &bound)); + + xla::XlaOp result = xla::CustomCall( + ctx->builder(), "SetBound", {ctx->Input("input")}, + ctx->InputXlaShape("input").ValueOrDie(), absl::StrFormat("%d", bound)); + ctx->SetOutput(0, result); + } +}; + +REGISTER_XLA_OP(Name("XlaSetBound").CompileTimeConstantInput("bound"), + XlaSetBoundOp); + class ShapeNOp : public XlaOpKernel { public: explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc index 46d4b70606e..a46cceddced 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc @@ -30,6 +30,7 @@ limitations under the License. 
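As the comments in the UnsortedSegmentReduce change above describe, the data shape is treated as [indices_shape, segment_shape] and the reduction buffer as [num_segments, segment_shape]. A host-side sketch of that shape derivation (the kernel builds the dynamic counterpart from GetDimensionSize and the num_segments operand):

#include <cstdint>
#include <iostream>
#include <vector>

// Drop the leading `indices_rank` dims of the data shape and prepend
// num_segments to obtain the buffer shape.
std::vector<int64_t> SegmentBufferShape(const std::vector<int64_t>& data_shape,
                                        int indices_rank,
                                        int64_t num_segments) {
  std::vector<int64_t> buffer(data_shape.begin() + indices_rank,
                              data_shape.end());
  buffer.insert(buffer.begin(), num_segments);
  return buffer;
}

int main() {
  // data [8, 16, 32] with 2-D indices [8, 16] and num_segments = 5 -> [5, 32].
  for (int64_t d : SegmentBufferShape({8, 16, 32}, /*indices_rank=*/2, 5)) {
    std::cout << d << " ";
  }
  std::cout << "\n";
  return 0;
}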
#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/math/math_util.h" @@ -180,7 +181,7 @@ Status CompileImpl( } xla::Literal alg_literal; TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal)); - auto alg = alg_literal.Get({}); + Algorithm alg = Algorithm(alg_literal.Get({})); if (!(alg == RNG_ALG_THREEFRY || alg == RNG_ALG_PHILOX)) { return errors::InvalidArgument("Unsupported algorithm id: ", alg); } @@ -407,5 +408,80 @@ REGISTER_XLA_OP(Name("StatefulUniformFullInt") {DT_INT32, DT_UINT32, DT_INT64, DT_UINT64}), StatefulUniformFullIntOp); +xla::XlaOp IncreaseCounter(Algorithm const& alg, xla::XlaOp counter, + xla::XlaOp delta) { + // Multiplying 256 to be consistent with the CPU/GPU kernels + delta = delta * ConstantR0WithType(delta.builder(), xla::U64, 256); + if (alg == RNG_ALG_PHILOX) { + return xla::PhiloxIncreaseCounter(counter, delta); + } else { + return counter + delta; + } +} + +xla::XlaOp PadRight(xla::XlaOp a, int n) { + return xla::Pad(a, xla::ScalarLike(a, 0), + xla::MakeEdgePaddingConfig({{0, n}})); +} + +template +class RngSkipOp : public XlaOpKernel { + public: + explicit RngSkipOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const int state_input_idx = 0; + const int alg_input_idx = 1; + const int delta_input_idx = 2; + xla::XlaOp var; + TensorShape var_shape; + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(state_input_idx, STATE_ELEMENT_DTYPE, + &var_shape, &var)); + xla::Literal alg_literal; + OP_REQUIRES_OK(ctx, ctx->ConstantInput(alg_input_idx, &alg_literal)); + Algorithm alg = Algorithm(alg_literal.Get({})); + OP_REQUIRES(ctx, alg == RNG_ALG_THREEFRY || alg == RNG_ALG_PHILOX, + errors::InvalidArgument("Unsupported algorithm id: ", alg)); + OP_REQUIRES_OK(ctx, CheckStateShape(alg, var_shape)); + if (read_old_value) { + auto counter_size = GetCounterSize(alg); + xla::XlaOp output = var; + if (RNG_MAX_COUNTER_SIZE > counter_size) { + // Because the size of `var` depends on the algorithm while we want the + // output to have a fixed size (to help shape inference), we fix the + // output size to be the maximal state size among algorithms, and right- + // pad it with zeros if var's size is smaller than that. 
+ output = PadRight(output, RNG_MAX_COUNTER_SIZE - counter_size); + } + ctx->SetOutput(0, output); + } + xla::XlaOp counter; + xla::XlaOp key; + std::tie(counter, key) = StateAndKeyFromVariable(alg, var); + xla::XlaOp delta = ctx->Input(delta_input_idx); + delta = BitcastConvertType(delta, xla::U64); + auto new_counter = IncreaseCounter(alg, counter, delta); + var = StateAndKeyToVariable(alg, new_counter, key); + xla::PrimitiveType state_element_type; + OP_REQUIRES_OK( + ctx, DataTypeToPrimitiveType(STATE_ELEMENT_DTYPE, &state_element_type)); + var = BitcastConvertType(var, state_element_type); + OP_REQUIRES_OK( + ctx, ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(RngSkipOp); +}; + +REGISTER_XLA_OP(Name("RngSkip").CompileTimeConstantInput("algorithm"), + RngSkipOp<>); + +using RngReadAndSkipOp = RngSkipOp; + +REGISTER_XLA_OP(Name("RngReadAndSkip").CompileTimeConstantInput("alg"), + RngReadAndSkipOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index 13c3dbe489e..e606812bc4e 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -111,6 +111,8 @@ xla::XlaOp StatelessRngUniform(absl::string_view device_type_string, } } +namespace { + xla::XlaOp StatelessRngUniformFullInt(absl::string_view device_type_string, xla::XlaOp seeds, const xla::Shape& shape) { @@ -140,8 +142,6 @@ xla::XlaOp StatelessRngUniformFullInt(absl::string_view device_type_string, } } -namespace { - class StatelessRandomUniformOp : public XlaOpKernel { public: explicit StatelessRandomUniformOp(OpKernelConstruction* ctx) diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc new file mode 100644 index 00000000000..e46fec3c576 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -0,0 +1,485 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/stateless_random_ops_v2.h" + +#include + +#include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h" +#include "tensorflow/compiler/tf2xla/lib/random.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/math/math_util.h" + +namespace tensorflow { + +namespace { + +inline xla::RandomAlgorithm AlgorithmToRandomAlgorithm(Algorithm const& alg) { + if (alg == RNG_ALG_PHILOX) { + return xla::RandomAlgorithm::RNG_PHILOX; + } + return xla::RandomAlgorithm::RNG_THREE_FRY; +} + +inline Algorithm RandomAlgorithmToAlgorithm(xla::RandomAlgorithm const& alg) { + if (alg == xla::RandomAlgorithm::RNG_PHILOX) { + return RNG_ALG_PHILOX; + } + return RNG_ALG_THREEFRY; +} + +xla::XlaOp GetCounter(xla::RandomAlgorithm const& alg, xla::XlaOp state) { + Algorithm alg_ = RandomAlgorithmToAlgorithm(alg); + return xla::Slice(state, {RNG_KEY_SIZE}, + {RNG_KEY_SIZE + GetCounterSize(alg_)}, {1}); +} + +xla::RngOutput BitGenerator(xla::RandomAlgorithm const& alg, xla::XlaOp key, + xla::XlaOp counter, const xla::Shape& shape) { + key = BitcastConvertType(key, xla::U64); + counter = BitcastConvertType(counter, xla::U64); + xla::XlaOp state = xla::ConcatInDim(key.builder(), {key, counter}, 0); + xla::XlaOp result = xla::RngBitGenerator(alg, state, shape); + auto new_counter = GetCounter(alg, xla::GetTupleElement(result, 0)); + new_counter = BitcastConvertType(new_counter, xla::S64); + return xla::RngOutput{/*value=*/xla::GetTupleElement(result, 1), + /*state=*/new_counter}; +} + +std::tuple GetKeyCounterAlg( + absl::string_view device_type_string, xla::XlaOp key) { + // The Philox algorithm may cause performance regression on other devices. + // Turn on the Philox algorithm for the CPU and GPU backends only. 
+ if (device_type_string == DEVICE_GPU_XLA_JIT || + device_type_string == DEVICE_CPU_XLA_JIT) { + auto counter_key = xla::ScramblePhiloxKey(key); + return std::make_tuple(counter_key.second, counter_key.first, + RNG_ALG_PHILOX); + } else { + auto counter_shape = + xla::ShapeUtil::MakeShape(xla::U64, {RNG_MAX_COUNTER_SIZE}); + auto counter = xla::Zeros(key.builder(), counter_shape); + return std::make_tuple(key, counter, RNG_ALG_THREEFRY); + } +} + +} // namespace + +xla::RngOutput StatelessRngUniformV2(xla::RandomAlgorithm const& alg, + xla::XlaOp key, xla::XlaOp counter, + const xla::Shape& shape, xla::XlaOp minval, + xla::XlaOp maxval) { + xla::XlaBuilder* builder = key.builder(); + xla::PrimitiveType type = shape.element_type(); + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + auto generator = std::bind(BitGenerator, alg, _1, _2, _3); + switch (type) { + case xla::F32: + case xla::F64: + return xla::UniformFloatingPointDistribution(key, counter, generator, + minval, maxval, shape); + case xla::S32: + case xla::S64: + case xla::U32: + case xla::U64: + return UniformIntDistribution(key, counter, generator, minval, maxval, + shape); + break; + default: + return {builder->ReportError(xla::Unimplemented( + "Types other than F32, S32, S64, U32 and U64 are not " + "implemented by " + "StatelessRngUniformV2; got %s", + xla::primitive_util::LowercasePrimitiveTypeName(type))), + counter}; + } +} + +namespace { + +xla::RngOutput StatelessRngUniformFullInt(xla::RandomAlgorithm const& alg, + xla::XlaOp key, xla::XlaOp counter, + const xla::Shape& shape) { + xla::XlaBuilder* builder = key.builder(); + + xla::PrimitiveType type = shape.element_type(); + xla::RngOutput output = BitGenerator(alg, key, counter, shape); + switch (type) { + case xla::U32: + case xla::U64: + return output; + case xla::S32: + case xla::S64: + return xla::RngOutput{BitcastConvertType(output.value, type), + output.state}; + default: + return { + builder->ReportError(xla::Unimplemented( + "Types other than U32, S32, U64 and S64 are not implemented by " + "StatelessRngUniformFullInt; got: %s", + xla::primitive_util::LowercasePrimitiveTypeName(type))), + output.state}; + } +} + +Status GetAlgorithm(XlaOpKernelContext* ctx, int alg_input_idx, + xla::RandomAlgorithm* alg) { + auto alg_shape = ctx->InputShape(alg_input_idx); + if (alg_shape.dims() != 0) { + return errors::InvalidArgument("algorithm must be of shape [], not ", + alg_shape.DebugString()); + } + xla::Literal alg_literal; + TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal)); + auto alg_ = Algorithm(alg_literal.Get({})); + *alg = AlgorithmToRandomAlgorithm(alg_); + return Status::OK(); +} + +xla::XlaOp MaybeSliceCounter(xla::RandomAlgorithm const& alg, + TensorShape const& counter_shape, + xla::XlaOp counter) { + auto input_counter_size = counter_shape.dim_size(0); + auto real_counter_size = GetCounterSize(RandomAlgorithmToAlgorithm(alg)); + if (input_counter_size > real_counter_size) { + counter = xla::Slice(counter, {0}, {real_counter_size}, {1}); + } + return counter; +} + +class StatelessRandomUniformOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaBuilder* builder = ctx->builder(); + + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int 
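GetKeyCounterAlg above picks the RNG per backend: Philox (with a scrambled key) on the XLA CPU and GPU JIT devices, ThreeFry with a zero counter elsewhere. A rough stand-alone sketch of just the selection logic; the literal device strings are assumptions standing in for DEVICE_CPU_XLA_JIT / DEVICE_GPU_XLA_JIT:

#include <iostream>
#include <string>

enum class Rng { kThreeFry, kPhilox };

// Philox is only enabled where it does not regress performance; other
// backends (e.g. TPU) fall back to ThreeFry.
Rng ChooseRng(const std::string& device_type) {
  return (device_type == "XLA_CPU_JIT" || device_type == "XLA_GPU_JIT")
             ? Rng::kPhilox
             : Rng::kThreeFry;
}

int main() {
  std::cout << (ChooseRng("XLA_GPU_JIT") == Rng::kPhilox) << "\n";    // 1
  std::cout << (ChooseRng("XLA_TPU_JIT") == Rng::kThreeFry) << "\n";  // 1
  return 0;
}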
counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? DT_DOUBLE : DT_FLOAT; + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + xla::PrimitiveType rng_primitive_type = xla_shape.element_type(); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + + auto result = StatelessRngUniformV2( + alg, key, counter, xla_shape, + xla::ConstantR0WithType(builder, rng_primitive_type, 0.0), + xla::ConstantR0WithType(builder, rng_primitive_type, 1.0)); + auto uniform = MaybeConvertF32ToBF16(result.value, dtype_); + ctx->SetOutput(0, uniform); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessRandomUniformOp); + +class StatelessRandomUniformIntOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformIntOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + const int minval_input_idx = 4; + const int maxval_input_idx = 5; + TensorShape minval_shape = ctx->InputShape(minval_input_idx); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval_shape), + errors::InvalidArgument("minval must be scalar, got shape ", + minval_shape.DebugString())); + TensorShape maxval_shape = ctx->InputShape(maxval_input_idx); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(maxval_shape), + errors::InvalidArgument("maxval must be scalar, got shape ", + maxval_shape.DebugString())); + + xla::XlaOp minval = ctx->Input(minval_input_idx); + xla::XlaOp maxval = ctx->Input(maxval_input_idx); + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = + StatelessRngUniformV2(alg, key, counter, xla_shape, minval, maxval); + ctx->SetOutput(0, result.value); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformIntV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64}), + StatelessRandomUniformIntOp); + +class StatelessRandomUniformFullIntOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformFullIntOp(OpKernelConstruction* ctx) 
+ : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = StatelessRngUniformFullInt(alg, key, counter, xla_shape); + ctx->SetOutput(0, result.value); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformFullIntOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformFullIntV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64}), + StatelessRandomUniformFullIntOp); + +class StatelessRandomNormalOp : public XlaOpKernel { + public: + explicit StatelessRandomNormalOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? 
DT_DOUBLE : DT_FLOAT; + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + auto generator = std::bind(BitGenerator, alg, _1, _2, _3); + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = xla::NormalFloatingPointDistribution(key, counter, generator, + xla_shape); + auto normal = MaybeConvertF32ToBF16(result.value, dtype_); + ctx->SetOutput(0, normal); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomNormalV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessRandomNormalOp); + +class StatelessTruncatedNormalOp : public XlaOpKernel { + public: + explicit StatelessTruncatedNormalOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + xla::XlaBuilder* builder = ctx->builder(); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? DT_DOUBLE : DT_FLOAT; + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = StatelessRngUniformV2( + alg, key, counter, xla_shape, + xla::MinPositiveNormalValue(builder, xla_shape.element_type()), + xla::One(builder, xla_shape.element_type())); + xla::XlaOp truncated_normal = TruncatedNormal(result.value); + truncated_normal = MaybeConvertF32ToBF16(truncated_normal, dtype_); + ctx->SetOutput(0, truncated_normal); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp); +}; + +REGISTER_XLA_OP(Name("StatelessTruncatedNormalV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessTruncatedNormalOp); + +class GetKeyCounterAlgOp : public XlaOpKernel { + public: + explicit GetKeyCounterAlgOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx), + device_type_string_(ctx->device_type().type_string()) {} + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape seed_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, seed_shape == TensorShape({2}), + errors::InvalidArgument("seed must have shape [2], not ", + seed_shape.DebugString())); + xla::XlaOp seed = ctx->Input(0); + + xla::XlaBuilder* builder = seed.builder(); + xla::XlaOp seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + xla::XlaOp seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + xla::XlaOp key = ConvertElementType(seed0, xla::U64) | + ShiftLeft(ConvertElementType(seed1, xla::U64), + ConstantR0WithType(builder, xla::U64, 32)); + auto key_counter_alg = GetKeyCounterAlg(device_type_string_, key); + key = std::get<0>(key_counter_alg); 
+ auto counter = std::get<1>(key_counter_alg); + auto alg = std::get<2>(key_counter_alg); + key = xla::Reshape(key, {RNG_KEY_SIZE}); + ctx->SetOutput(0, key); + ctx->SetOutput(1, counter); + ctx->SetOutput(2, ConstantR0(builder, static_cast(alg))); + } + + private: + string device_type_string_; + + TF_DISALLOW_COPY_AND_ASSIGN(GetKeyCounterAlgOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomGetKeyCounterAlg"), GetKeyCounterAlgOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 784b790767c..943d92982cb 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/util/strided_slice_op.h" +#include + +#include "absl/algorithm/container.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -23,16 +26,20 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/ops_util.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/mem.h" namespace tensorflow { namespace { +using errors::InvalidArgument; class StridedSliceOp : public XlaOpKernel { public: @@ -48,7 +55,6 @@ class StridedSliceOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape input_shape = ctx->InputShape(0); const TensorShape begin_shape = ctx->InputShape("begin"); - OP_REQUIRES( ctx, begin_shape.dims() == 1, errors::InvalidArgument("'begin' input has to be a rank 1 vector")); @@ -78,20 +84,24 @@ class StridedSliceOp : public XlaOpKernel { TensorShape final_shape; PartialTensorShape dummy_processing_shape, partial_final_shape; bool dummy = false; - OP_REQUIRES_OK(ctx, ValidateStridedSliceOp( - begin_is_constant ? &begin_tensor : nullptr, - end_is_constant ? &end_tensor : nullptr, - strides_tensor, input_shape, begin_mask_, end_mask_, - ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, - &dummy_processing_shape, &partial_final_shape, - &dummy, &dummy, &dummy, &begin, &end, &strides)); + absl::InlinedVector output_to_sparse_mapping; + absl::InlinedVector output_to_processing_mapping; + OP_REQUIRES_OK( + ctx, + ValidateStridedSliceOp( + begin_is_constant ? &begin_tensor : nullptr, + end_is_constant ? 
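GetKeyCounterAlgOp above packs the two 32-bit words of the seed into a single 64-bit key, with seed1 occupying the high half. A host-side equivalent (shown for non-negative seeds; the kernel does the widening with ConvertElementType and ShiftLeft):

#include <cstdint>
#include <iostream>

// seed0 fills the low 32 bits of the key, seed1 the high 32 bits.
uint64_t PackKey(uint32_t seed0, uint32_t seed1) {
  return static_cast<uint64_t>(seed0) | (static_cast<uint64_t>(seed1) << 32);
}

int main() {
  std::cout << std::hex << PackKey(0xdeadbeef, 0x12345678) << "\n";
  // prints 12345678deadbeef
  return 0;
}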
&end_tensor : nullptr, strides_tensor, + input_shape, begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_, + shrink_axis_mask_, &dummy_processing_shape, &partial_final_shape, + &dummy, &dummy, &dummy, &begin, &end, &strides, + &output_to_sparse_mapping, &output_to_processing_mapping)); - OP_REQUIRES(ctx, partial_final_shape.AsTensorShape(&final_shape), - errors::InvalidArgument( - "XLA can't deduce compile time constant output " - "shape for strided slice: ", - partial_final_shape.DebugString(), - ", output shape must be a compile-time constant")); + OP_REQUIRES( + ctx, partial_final_shape.AsTensorShape(&final_shape), + InvalidArgument("XLA can't deduce compile time constant output " + "shape for strided slice: ", + partial_final_shape.DebugString(), + ", output shape must be a compile-time constant")); xla::XlaOp slice = ctx->Input(0); if (begin_is_constant && end_is_constant) { @@ -119,69 +129,84 @@ class StridedSliceOp : public XlaOpKernel { auto operand_shape_or = ctx->builder()->GetShape(ctx->Input(0)); OP_REQUIRES_OK(ctx, operand_shape_or.status()); xla::Shape xla_shape = operand_shape_or.ValueOrDie(); - if (xla_shape.is_static()) { - // Static output shape, return a static slice. - slice = xla::Reshape(slice, final_shape.dim_sizes()); + std::vector begins_are_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(1, &begins_are_dynamic)); + std::vector ends_are_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(2, &ends_are_dynamic)); + bool begins_are_static = absl::c_all_of( + begins_are_dynamic, [](bool dynamic) { return !dynamic; }); + OP_REQUIRES(ctx, begins_are_static, + errors::InvalidArgument( + "XLA can't use dynamic begin values for slice.")); + bool ends_are_static = absl::c_all_of( + ends_are_dynamic, [](bool dynamic) { return !dynamic; }); + // Static output shape, return a static slice. + slice = xla::Reshape(slice, final_shape.dim_sizes()); + if (xla_shape.is_static() && ends_are_static) { ctx->SetOutput(0, slice); return; } - auto input_dim_sizes = input_shape.dim_sizes(); - for (int64 i = 0; i < xla_shape.rank(); ++i) { - if (xla_shape.is_dynamic_dimension(i)) { - input_dim_sizes[i] = -1; + for (int64 i = 0; i < final_shape.dims(); ++i) { + int64 input_index = output_to_processing_mapping[i]; + if (input_index == -1) { + continue; } - } - PartialTensorShape input_partial_shape(input_dim_sizes); - partial_final_shape.Clear(); - end.clear(); - strides.clear(); - begin.clear(); - // Run shape inferenference again with partial shape. - OP_REQUIRES_OK(ctx, ValidateStridedSliceOp( - &begin_tensor, &end_tensor, strides_tensor, - input_partial_shape, begin_mask_, end_mask_, - ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, - &dummy_processing_shape, &partial_final_shape, - &dummy, &dummy, &dummy, &begin, &end, &strides)); - if (partial_final_shape.AsTensorShape(&final_shape)) { - // Static output shape, return a static slice. - slice = xla::Reshape(slice, final_shape.dim_sizes()); - ctx->SetOutput(0, slice); - return; - } + bool input_is_dynamic = xla_shape.is_dynamic_dimension(input_index); - // We consider slicing a dynamic tensor t with negative indices as a - // dynamic sized slice. E.g., t[: -n], the result length is shape(t) - n - for (int64 i = 0; i < partial_final_shape.dims(); ++i) { - bool dynamic_dim = partial_final_shape.dim_size(i) - 1; - bool backward_slice = end[i] < 0; - if (dynamic_dim && backward_slice) { + int64 sparse_index = output_to_sparse_mapping[i]; + bool end_is_dynamic = + sparse_index == -1 ? 
false : ends_are_dynamic[sparse_index]; + bool backward_slice = sparse_index == -1 + ? false + : end_literal.Get({sparse_index}) < 0; + if ((input_is_dynamic && backward_slice) || end_is_dynamic) { OP_REQUIRES( - ctx, strides[i] == 1, + ctx, strides[input_index] == 1, errors::InvalidArgument("XLA has not implemented dynamic " "sized slice with non-trival stride yet. " "Please file a bug against XLA")); - - OP_REQUIRES(ctx, begin[i] >= 0, - errors::InvalidArgument( - "XLA has not implemented dynamic " - "sized slice with negative begin index %lld. " - "Please file a bug against XLA", - begin[i])); // If there is a dynamic dimension, properly set dimension size of // the result. - auto operand_size = xla::GetDimensionSize(ctx->Input(0), i); - - operand_size = xla::Add( - operand_size, xla::ConstantR0(ctx->builder(), end[i])); + auto operand_size = xla::GetDimensionSize(ctx->Input(0), input_index); + if (backward_slice) { + // We consider slicing a dynamic tensor t with negative indices as + // a dynamic sized slice. E.g., t[: -n], the result length is + // shape(t) - n. + OP_REQUIRES(ctx, !end_is_dynamic, + errors::InvalidArgument( + "XLA has not implemented dynamic " + "sized slice with dynamic negative index %lld. ")); + operand_size = xla::Add( + operand_size, + xla::ConstantR0(ctx->builder(), + end_literal.Get({sparse_index}))); + } else { + // The end of slice with dynamic slice size is the min of operand + // shape and slice size. E.g., t[:end_size], result size is + // min(shape(t), end_size). + xla::XlaOp end_size; + if (end_is_dynamic) { + end_size = xla::Reshape(xla::Slice(ctx->Input(2), {sparse_index}, + {sparse_index + 1}, {1}), + {}); + } else { + end_size = + xla::ConstantR0(ctx->builder(), end[input_index]); + } + operand_size = xla::Min(operand_size, end_size); + } slice = xla::SetDimensionSize( slice, - xla::Sub(operand_size, - xla::ConstantR0(ctx->builder(), begin[i])), + xla::Sub(operand_size, xla::ConstantR0( + ctx->builder(), begin[input_index])), i); } } + ctx->SetOutput(0, slice); + return; } else { // When output shape is fully defined, it must be a size one slice: // @@ -239,9 +264,9 @@ class StridedSliceOp : public XlaOpKernel { std::vector output_shape_dim_sizes; slice = xla::DynamicSlice(slice, start_indices, slice_sizes); + slice = xla::Reshape(slice, final_shape.dim_sizes()); + ctx->SetOutput(0, slice); } - slice = xla::Reshape(slice, final_shape.dim_sizes()); - ctx->SetOutput(0, slice); } private: @@ -267,6 +292,83 @@ class StridedSliceGradOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_)); } + // When the begin / end is unknown, compile the gradient into dynamic update + // slice into a broadcasted 0s. + // + // Broadcasted 0 + // +----------------------+ + // | +----+ | + // |<-begin->|grad|<-end->| <== Dynamic update grad into 0s. 
+ // | +----+ | + // +----------------------+ + void CompileAsDynamicUpdateSlice(XlaOpKernelContext* ctx, + const TensorShape& input_shape, + const xla::Literal& strides_literal) { + bool dummy = false; + Tensor strides_tensor; + PartialTensorShape processing_shape, final_shape; + absl::InlinedVector begin; + absl::InlinedVector end; + absl::InlinedVector strides; + + absl::InlinedVector output_to_sparse_mapping; + absl::InlinedVector output_to_processing_mapping; + + OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, + &strides_tensor)); + OP_REQUIRES_OK( + ctx, ValidateStridedSliceOp( + nullptr, nullptr, strides_tensor, input_shape, begin_mask_, + end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, + &processing_shape, &final_shape, &dummy, &dummy, &dummy, + &begin, &end, &strides, &output_to_sparse_mapping, + &output_to_processing_mapping)); + for (int64 i = 0; i < processing_shape.dims(); ++i) { + OP_REQUIRES( + ctx, strides[i] == 1, + errors::InvalidArgument("Strides in strided slice grad have to be " + "one when inputs are not constant.")); + } + + auto zero = XlaHelpers::Zero(ctx->builder(), ctx->expected_output_dtype(0)); + zero = xla::Broadcast(zero, input_shape.dim_sizes()); + xla::XlaOp grad = ctx->Input(4); + xla::Shape grad_shape = ctx->InputXlaShape(4).ValueOrDie(); + // Undo any new/shrink axes. + VLOG(1) << "xla grad shape" << grad_shape; + VLOG(1) << "input_shape" << input_shape.DebugString(); + std::vector begins(processing_shape.dims(), + xla::Zero(ctx->builder(), xla::S32)); + for (int64 i = 0; i < grad_shape.rank(); ++i) { + // Use grad shape, which is known, to update unknown processing shape. + // Grad shape is the output of the ValidateStridedSliceOp function in + // forward pass, thus we use output_to_processing_mapping. + if (output_to_processing_mapping[i] != -1) { + processing_shape.set_dim(output_to_processing_mapping[i], + grad_shape.dimensions(i)); + } + + // Similarly, use output_to_sparse_mapping to find out corresponding + // begin dim of the output, as indices for dynamic update slice. 
+ int64 begin_dim = output_to_sparse_mapping[i]; + if (begin_dim != -1) { + auto begin_index = + xla::Slice(ctx->Input(1), {begin_dim}, {begin_dim + 1}, {1}); + auto begin_index_scalar = xla::Reshape( + xla::ShapeUtil::MakeScalarShape(xla::S32), begin_index); + begins[output_to_sparse_mapping[i]] = begin_index_scalar; + } + } + VLOG(1) << "processing_shape" << processing_shape.DebugString(); + TensorShape full_processing_shape; + OP_REQUIRES(ctx, processing_shape.AsTensorShape(&full_processing_shape), + errors::InvalidArgument( + "Processing shape ", processing_shape.DebugString(), + " can't be fully inferred from grad shape")); + grad = xla::Reshape(grad, full_processing_shape.dim_sizes()); + grad = xla::DynamicUpdateSlice(zero, grad, begins); + ctx->SetOutput(0, grad); + } void Compile(XlaOpKernelContext* ctx) override { TensorShape processing_shape, final_shape; absl::InlinedVector begin; @@ -275,12 +377,15 @@ class StridedSliceGradOp : public XlaOpKernel { TensorShape input_shape; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape)); - xla::Literal begin_literal, end_literal, strides_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &begin_literal)); - OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &end_literal)); - OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); + bool begin_is_constant = ctx->ConstantInput(1, &begin_literal).ok(); + bool end_is_constant = ctx->ConstantInput(2, &end_literal).ok(); + OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); + if (!(begin_is_constant && end_is_constant)) { + CompileAsDynamicUpdateSlice(ctx, input_shape, strides_literal); + return; + } Tensor begin_tensor, end_tensor, strides_tensor; OP_REQUIRES_OK( ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor)); @@ -423,7 +528,12 @@ class StridedSliceAssignOp : public XlaOpKernel { TensorShape lhs_shape; xla::XlaOp lhs; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); + if (ctx->input_type(0) == DT_RESOURCE) { + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); + } else { + lhs_shape = ctx->InputShape(0); + lhs = ctx->Input(0); + } const TensorShape rhs_shape = ctx->InputShape(4); @@ -481,7 +591,11 @@ class StridedSliceAssignOp : public XlaOpKernel { lhs = xla::DynamicUpdateSlice(lhs, rhs, slice_begin); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); + if (ctx->input_type(0) == DT_RESOURCE) { + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); + } else { + ctx->SetOutput(0, lhs); + } } private: @@ -497,5 +611,11 @@ REGISTER_XLA_OP(Name("ResourceStridedSliceAssign") .CompileTimeConstantInput("strides"), StridedSliceAssignOp); +REGISTER_XLA_OP(Name("TensorStridedSliceUpdate") + .CompileTimeConstantInput("begin") + .CompileTimeConstantInput("end") + .CompileTimeConstantInput("strides"), + StridedSliceAssignOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index 976ff91f6ce..1ea0e797675 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -45,22 +45,32 @@ namespace tensorflow { namespace { // GetTensorListDynamicDims collects the dynamic dimensions that a tensorlist -// may carry and returns them in a 2D vector: int64[ElementSize][DimSize]. If a -// dimension is static, a constant dimension is returned. +// may carry and returns them in a 2D vector: XlaOp[ElementSize][DimSize]. 
If a +// dimension is static, a constant dimension is returned. If a dim is dynamic, a +// dynamic XlaOp representing the dynamic size is returned. xla::StatusOr>> GetTensorListDynamicDims( XlaOpKernelContext* ctx, const xla::Shape& element_shape, const xla::Shape& list_shape, int64 num_elements) { std::vector dynamic_sizes; - ctx->set_dynamic_dimension_is_minus_one(true); // The multiplier can be a dynamic value. TF_RETURN_IF_ERROR(ctx->ConstantInputAsIntVector(0, &dynamic_sizes)); + std::vector dims_are_dynamic; + TF_RETURN_IF_ERROR( + ctx->ResolveInputDynamismIntoPredVector(0, &dims_are_dynamic)); + bool leading_dim_is_dynamic; + TF_RETURN_IF_ERROR( + ctx->ResolveInputDynamismIntoPred(1, &leading_dim_is_dynamic)); std::vector> list_dynamic_dims; // Set dynamic dimension size to 0 for initialization value. std::vector dynamic_dims; - // Leading dim is a static dimension. - dynamic_dims.push_back(xla::ConstantR0(ctx->builder(), num_elements)); + if (leading_dim_is_dynamic) { + dynamic_dims.push_back(ctx->Input(1)); + } else { + dynamic_dims.push_back( + xla::ConstantR0(ctx->builder(), num_elements)); + } for (int64 dim = 0; dim < element_shape.dimensions_size(); ++dim) { - if (ctx->is_dynamic_dimension(dynamic_sizes[dim])) { + if (dims_are_dynamic[dim]) { auto dynamic_dim_size = xla::Slice(ctx->Input(0), {dim}, {dim + 1}, {1}); dynamic_dim_size = xla::Reshape(dynamic_dim_size, {}); dynamic_dim_size = xla::ConvertElementType(dynamic_dim_size, xla::S32); @@ -80,11 +90,12 @@ class TensorListLengthOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 leading_dim; - OP_REQUIRES_OK(ctx, - GetLeadingDimForTensorList(ctx->Input(0), &leading_dim)); - Tensor length_tensor(DT_INT32, {}); - length_tensor.scalar()() = static_cast(leading_dim); - ctx->SetConstantOutput(0, length_tensor); + xla::XlaOp leading_dim_size; + bool leading_dim_is_dynamic; + OP_REQUIRES_OK(ctx, GetLeadingDimForTensorList(ctx->Input(0), &leading_dim, + &leading_dim_is_dynamic, + &leading_dim_size)); + ctx->SetOutput(0, leading_dim_size); } private: @@ -134,6 +145,9 @@ class TensorListReserveOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); + bool num_element_is_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPred(1, &num_element_is_dynamic)); OP_REQUIRES( ctx, num_elements >= 0, errors::InvalidArgument( @@ -156,7 +170,8 @@ class TensorListReserveOp : public XlaOpKernel { if (got_shape) { xla::Shape list_shape; OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape( - element_shape, num_elements, &list_shape)); + element_shape, num_elements, + num_element_is_dynamic, &list_shape)); // Set up dynamic dimension sizes to create the zero tensor. 
auto list_dynamic_dims_or = GetTensorListDynamicDims( ctx, element_shape, list_shape, num_elements); @@ -175,8 +190,8 @@ class TensorListReserveOp : public XlaOpKernel { return; } - xla::XlaOp result = - BuildUninitializedTensorList(ctx->builder(), num_elements); + xla::XlaOp result = BuildUninitializedTensorList( + ctx->builder(), num_elements, num_element_is_dynamic, ctx->Input(1)); ctx->SetTensorListOutput(0, result); } @@ -200,6 +215,9 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); + bool num_element_is_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPred(1, &num_element_is_dynamic)); OP_REQUIRES(ctx, max_num_elements >= 0, errors::InvalidArgument( "XLA compilation requires a fixed tensor list size. Set " @@ -210,9 +228,9 @@ class EmptyTensorListOp : public XlaOpKernel { if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. - // If element shape is compile time constant and it's not "unknown rank" - // shape (-1), create an initialized TensorList. Otherwise create an - // uninitialized TensorList. + // If element shape is compile time constant and it's not "unknown + // rank" shape (-1), create an initialized TensorList. Otherwise + // create an uninitialized TensorList. xla::XlaOp element_shape_handle = ctx->Input(0); xla::PrimitiveType type; OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type)); @@ -224,7 +242,8 @@ class EmptyTensorListOp : public XlaOpKernel { if (got_shape) { xla::Shape list_shape; OP_REQUIRES_OK(ctx, GetTensorListShapeFromElementShape( - element_shape, max_num_elements, &list_shape)); + element_shape, max_num_elements, + num_element_is_dynamic, &list_shape)); // Set up dynamic dimension sizes to create the zero tensor. auto list_dynamic_dims_or = GetTensorListDynamicDims( ctx, element_shape, list_shape, max_num_elements); @@ -243,7 +262,8 @@ class EmptyTensorListOp : public XlaOpKernel { // We are creating a nested TensorList or a non-nested TensorList with // unknown shape. Just create an uninitialized TensorList. 
xla::XlaOp result = - BuildUninitializedTensorList(ctx->builder(), max_num_elements); + BuildUninitializedTensorList(ctx->builder(), max_num_elements, + num_element_is_dynamic, ctx->Input(1)); ctx->SetTensorListOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index aa71e4d4364..156f9bfea40 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -189,28 +189,42 @@ Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index, } xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b, - int64 leading_dimension) { + int64 leading_dimension, + bool leading_size_is_dynamic, + xla::XlaOp leading_dim_size) { auto zero = xla::ConstantLiteral(b, xla::LiteralUtil::Zero(xla::PrimitiveType::S32)); - return xla::Broadcast(zero, std::vector{leading_dimension}); + auto broadcast = xla::Broadcast(zero, std::vector{leading_dimension}); + if (leading_size_is_dynamic) { + return xla::SetDimensionSize(broadcast, leading_dim_size, 0); + } else { + return broadcast; + } } -Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim) { +Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim, + bool* leading_dim_is_dynamic, + xla::XlaOp* leading_dim_dynamic_size) { bool is_initialized; TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized)); TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list)); if (is_initialized) { auto buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0); + *leading_dim_is_dynamic = buffer_shape.is_dynamic_dimension(0); + auto buffer = xla::GetTupleElement(list, 0); *leading_dim = buffer_shape.dimensions(0); + *leading_dim_dynamic_size = xla::GetDimensionSize(buffer, 0); } else { + *leading_dim_is_dynamic = list_shape.is_dynamic_dimension(0); *leading_dim = list_shape.dimensions(0); + *leading_dim_dynamic_size = xla::GetDimensionSize(list, 0); } return Status::OK(); } Status GetTensorListShapeFromElementTensorListShape( const xla::Shape& element_tensor_list_shape, int64 leading_dim, - xla::Shape* tensor_list_shape) { + bool leading_dim_is_dynamic, xla::Shape* tensor_list_shape) { std::vector shapes; int tuple_size = xla::ShapeUtil::TupleElementCount(element_tensor_list_shape); for (int i = 0; i < tuple_size; i++) { @@ -220,6 +234,9 @@ Status GetTensorListShapeFromElementTensorListShape( dimensions.insert(dimensions.begin(), leading_dim); shapes.push_back( xla::ShapeUtil::MakeShape(shape.element_type(), dimensions)); + if (leading_dim_is_dynamic) { + shapes.back().set_dynamic_dimension(0, true); + } } shapes.push_back( xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector{})); @@ -229,6 +246,7 @@ Status GetTensorListShapeFromElementTensorListShape( Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, int64 leading_dim, + bool leading_dim_is_dynamic, xla::Shape* tensor_list_shape) { if (!element_shape.IsArray()) { return errors::InvalidArgument( @@ -236,12 +254,12 @@ Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, "shape. 
But element shape is ", element_shape.DebugString()); } - std::vector shapes; std::vector dimensions = xla::SpanToVector(element_shape.dimensions()); dimensions.insert(dimensions.begin(), leading_dim); shapes.push_back( xla::ShapeUtil::MakeShape(element_shape.element_type(), dimensions)); + shapes.back().set_dynamic_dimension(0, leading_dim_is_dynamic); shapes.push_back( xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector{})); *tensor_list_shape = xla::ShapeUtil::MakeTupleShape(shapes); @@ -279,7 +297,10 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, bool element_is_tensor_list, xla::XlaOp* initialized_list) { int64 leading_dim; - TF_RETURN_IF_ERROR(GetLeadingDimForTensorList(list, &leading_dim)); + xla::XlaOp leading_dim_dynamic_size; + bool leading_dim_is_dynamic; + TF_RETURN_IF_ERROR(GetLeadingDimForTensorList( + list, &leading_dim, &leading_dim_is_dynamic, &leading_dim_dynamic_size)); xla::XlaBuilder* b = list.builder(); xla::Shape list_shape; @@ -287,12 +308,11 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, if (element_is_tensor_list) { TF_RETURN_IF_ERROR(GetTensorListShapeFromElementTensorListShape( - element_shape, leading_dim, &list_shape)); + element_shape, leading_dim, leading_dim_is_dynamic, &list_shape)); } else { TF_RETURN_IF_ERROR(GetTensorListShapeFromElementShape( - element_shape, leading_dim, &list_shape)); + element_shape, leading_dim, leading_dim_is_dynamic, &list_shape)); } - bool is_initialized; TF_RETURN_IF_ERROR(IsTensorListInitialized(list, &is_initialized)); if (is_initialized) { @@ -312,8 +332,7 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, for (int64 i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { std::vector dynamic_dims; const xla::Shape& shape = list_shape.tuple_shapes(i); - // Leading dim is a static dimension. - dynamic_dims.push_back(xla::ConstantR0(b, leading_dim)); + dynamic_dims.push_back(leading_dim_dynamic_size); xla::XlaOp sub_element; if (element_is_tensor_list) { sub_element = xla::GetTupleElement(element, i); @@ -504,7 +523,9 @@ Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp list_part = xla::GetTupleElement(list, 0); xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape); - for (int64 i = 0; i < buffer_shape.dimensions_size(); ++i) { + // Propagate dynamic dimensions from buffer to the sliced buffer, except for + // leading dimension (which is always static 1). + for (int64 i = 1; i < buffer_shape.dimensions_size(); ++i) { if (buffer_shape.is_dynamic_dimension(i)) { auto buffer = xla::GetTupleElement(list, 0); auto gds = xla::GetDimensionSize(buffer, i); diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h index ef3c8badf71..549ccd5aece 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h @@ -60,17 +60,22 @@ Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index, // Returns an uninitialized TensorList. xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b, - int64 leading_dimension); + int64 leading_dimension, + bool leading_size_is_dynamic, + xla::XlaOp leading_dim_size); -// Returns leading dimension for the TensorList. -// Input can be initialized or uninitialized TensorList. -// Non-nested and nested TensorLists are both supported. 
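GetTensorListShapeFromElementShape above builds the list buffer shape by prepending the leading (num_elements) dimension to the element shape and, with this change, optionally marking that leading dimension as dynamic. A rough sketch of just that shape bookkeeping, using hypothetical plain-C++ types instead of xla::Shape:

    #include <cstdint>
    #include <vector>

    struct DimSpec {
      int64_t size;
      bool is_dynamic;
    };

    // Prepend the leading dimension and flag its dynamism; the element
    // dimensions keep whatever static sizes they already had.
    std::vector<DimSpec> MakeListBufferDims(
        const std::vector<int64_t>& element_dims, int64_t leading_dim,
        bool leading_dim_is_dynamic) {
      std::vector<DimSpec> dims;
      dims.push_back({leading_dim, leading_dim_is_dynamic});
      for (int64_t d : element_dims) dims.push_back({d, /*is_dynamic=*/false});
      return dims;
    }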
-Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim); +// Returns leading dimension for the TensorList as well as a dynamic op +// representing the dynamic size. Input can be initialized or uninitialized +// TensorList. Non-nested and nested TensorLists are both supported. +Status GetLeadingDimForTensorList(xla::XlaOp list, int64* leading_dim, + bool* leading_dim_is_dynamic, + xla::XlaOp* leading_dim_dynamic_size); // Returns TensorList shape for the element shape. // Element shape must be a normal tensor shape. Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, int64 leading_dim, + bool leading_dim_is_dynamic, xla::Shape* tensor_list_shape); // Returns a TensorList filled by zeros with the given shape. diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index fe7a5898011..a94411f1b30 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -513,10 +513,26 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Prepare dynamic dimensions for element shapes. std::vector> list_dynamic_dims; for (int64 i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { - // Set dynamic dimension size to 0 for initilization value. std::vector dynamic_dims; + const xla::Shape& shape = list_shape.tuple_shapes(i); - for (int64 dim = 0; dim < shape.dimensions_size(); ++dim) { + + // We already have the dynamic size of leading dimension outside of + // the while loop without initializing the TensorList inside the while + // loop. + if (shape.is_dynamic_dimension(0)) { + xla::XlaOp leading_dim_size = xla::GetDimensionSize(input, 0); + dynamic_dims.push_back(leading_dim_size); + } else { + int32 dim_size = shape.dimensions(0); + dynamic_dims.push_back( + xla::ConstantR0(ctx->builder(), dim_size)); + } + + // Set dynamic dimension size to 0 for element value. Inside the while + // loop, TensorlistSetItem will properly set the element shape's + // dynamic diemnsion. + for (int64 dim = 1; dim < shape.dimensions_size(); ++dim) { int32 dim_size = shape.dimensions(dim); if (shape.is_dynamic_dimension(dim)) { dim_size = 0; diff --git a/tensorflow/compiler/tf2xla/lib/data_format.cc b/tensorflow/compiler/tf2xla/lib/data_format.cc index e5913a8bbf3..eb1ab79d165 100644 --- a/tensorflow/compiler/tf2xla/lib/data_format.cc +++ b/tensorflow/compiler/tf2xla/lib/data_format.cc @@ -62,7 +62,7 @@ xla::StatusOr Expand(xla::XlaOp input, int64 dim) { std::vector expanded_shape = xla::SpanToVector(input_shape.dimensions()); expanded_shape[dim] /= 4; - expanded_shape.insert(expanded_shape.begin() + dim, 4); + expanded_shape.insert(expanded_shape.begin() + dim + 1, 4); // Move the newly created dimension to the end with a transpose. 
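The data_format.cc fix above changes where the new size-4 dimension lands: the expanded dimension is divided by 4 and the 4 is inserted immediately after it (begin() + dim + 1) rather than before it. The shape arithmetic as a standalone sketch (helper name is illustrative):

    #include <cstdint>
    #include <vector>

    // Split dimension `dim` of size 4*k into (k, 4), keeping the 4 right after
    // the dimension it came from. E.g. ExpandDimBy4({2, 8, 3}, 1) == {2, 2, 4, 3}.
    std::vector<int64_t> ExpandDimBy4(std::vector<int64_t> shape, int64_t dim) {
      shape[dim] /= 4;
      shape.insert(shape.begin() + dim + 1, 4);
      return shape;
    }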
std::vector permutation; diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index abaeb305104..92a83436346 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -90,18 +90,6 @@ Status ConvertOutputInfo(const tf2xla::Config& config, return ParseOutputArrayInfo(array_names, &specs->outputs); } -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; -} - } // namespace Status ConvertGraphDefToXlaViaMlir( @@ -150,7 +138,6 @@ Status ConvertGraphDefToXlaViaMlir( } } - RegisterDialects(); mlir::MLIRContext context; TF_ASSIGN_OR_RETURN( mlir::OwningModuleRef module, diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index f4b9e9654d2..2f895b17219 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -291,6 +291,16 @@ dimension_numbers: a serialized xla::DotDimensionNumbers proto. precision_config: a serialized xla::PrecisionConfig proto. )doc"); +REGISTER_OP("XlaSetBound") + .Input("input: int32") + .Input("bound: int32") + .Output("output: int32") + .SetShapeFn(shape_inference::UnknownShape) + .Doc( + R"doc(Set a bound for the given input value as a hint to Xla compiler, + returns the same value. +)doc"); + REGISTER_OP("XlaDynamicSlice") .Input("input: T") .Input("start_indices: Tindices") diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 846dafa2570..19104518b71 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -387,6 +387,14 @@ def reduce_window(operand, replica_id = gen_xla_ops.xla_replica_id +# Set a static bound for the given input value as a hint to Xla compiler, +# returns the same value. +# Usage: +# def f(t, p): +# p = xla.set_bound(p, 3) # Tells xla the constraint that p <= 3. +# return t[:p] # xla knows the bound of the slice is 3. +set_bound = gen_xla_ops.xla_set_bound + def reshape(x, new_sizes, dimensions=None, name=None): if dimensions is not None: diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc index 2db431c0413..860c3a40424 100644 --- a/tensorflow/compiler/tf2xla/resource_operation_table.cc +++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc @@ -83,6 +83,8 @@ CreateResourceOpInfoMap() { add("ResourceScatterSub" , kReadWrite, kVariable); add("ResourceScatterUpdate" , kReadWrite, kVariable); add("ResourceStridedSliceAssign" , kReadWrite, kVariable); + add("RngReadAndSkip" , kReadWrite, kVariable); + add("RngSkip" , kReadWrite, kVariable); add("StatefulStandardNormalV2" , kReadWrite, kVariable); add("StatefulTruncatedNormal" , kReadWrite, kVariable); add("StatefulUniform" , kReadWrite, kVariable); diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 242a2b04ab9..3cf9df64b0b 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -137,7 +137,6 @@ Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { const auto& it = node.attr().find("allowed_devices"); if (it != node.attr().end()) { if (!it->second.list().s().empty()) { - // TODO(b/149512838): Support non-empty allowed devices. 
return errors::InvalidArgument( "VarHandleOp with non-empty allowed devices is not supported."); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 635b7170d82..b22dc05eaa1 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -36,6 +37,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/device.h" @@ -732,7 +734,7 @@ Status XlaCompiler::CompileFunction( VLOG(1) << "Using MLIR bridge"; GraphDebugInfo debug_info; TF_RETURN_IF_ERROR(CompileGraphToXlaHlo( - std::move(*graph), {args.data(), args.size()}, + std::move(*graph), mlir::SpanToArrayRef(args), options_.device_type.type_string(), options.use_tuple_arg, *options_.flib_def, debug_info, options_.shape_representation_fn, result)); @@ -990,20 +992,6 @@ Status XlaCompiler::BuildArguments( tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } - for (int i = 0, end = input_to_args->size(); i < end; ++i) { - const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; - for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { - int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); - VLOG(1) << "Setting dynamic binding " << i << " -> " - << dynamic_size_param_index; - - TF_RETURN_IF_ERROR(builder->SetDynamicBinding( - /*dynamic_size_param_num=*/0, {dynamic_size_param_index}, - /*target_param_num=*/0, /*target_param_index=*/{i}, - dim_and_arg_num.first)); - } - } - for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( @@ -1035,16 +1023,17 @@ Status XlaCompiler::BuildArguments( absl::StrCat("arg", i)); } } + } - for (int i = 0, end = input_to_args->size(); i < end; ++i) { - const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; - for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { - int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); - TF_RETURN_IF_ERROR(builder->SetDynamicBinding( - /*dynamic_size_param_num=*/dynamic_size_param_index, {}, - /*target_param_num=*/i, /*target_param_index=*/{}, - dim_and_arg_num.first)); - } + for (int i = 0, end = input_to_args->size(); i < end; ++i) { + const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; + for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { + int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); + VLOG(1) << "Setting dynamic size " << i << " -> " + << dynamic_size_param_index; + arg_handles[i] = xla::SetDimensionSize( + arg_handles[i], arg_handles[dynamic_size_param_index], + dim_and_arg_num.first); } } @@ -1155,7 +1144,11 @@ Status ValidateGraph(const Graph* graph, 
return errors::InvalidArgument(absl::StrCat( "Detected unsupported operations when trying to compile graph ", name, " on ", device_type.type_string(), ": ", node->def().op(), " (", - s.error_message(), ")", FormatNodeForError(*node))); + s.error_message(), ")", FormatNodeForError(*node), + "One approach is to outside compile the unsupported ops to run on " + "CPUs by enabling soft placement " + "`tf.config.set_soft_device_placement(True)`." + " This has a potential performance penalty.")); } return Status::OK(); }; @@ -1370,8 +1363,15 @@ Status XlaCompiler::SetDeviceToHostMetadata( const string& key, absl::Span types, absl::Span shapes) { if (host_compute_sends_.find(key) != host_compute_sends_.end()) { - return errors::InvalidArgument( - "Duplicate calls to SetDeviceToHostMetadata with key ", key); + tf2xla::HostTransferMetadata& existing_transfer = host_compute_sends_[key]; + tf2xla::HostTransferMetadata new_transfer; + SetTransfer(key, types, shapes, &new_transfer); + if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + return Status::OK(); + } else { + return errors::InvalidArgument( + "Duplicate calls to SetDeviceToHostMetadata with key ", key); + } } tf2xla::HostTransferMetadata& transfer = host_compute_sends_[key]; SetTransfer(key, types, shapes, &transfer); @@ -1396,9 +1396,16 @@ Status XlaCompiler::GetDeviceToHostShapes( Status XlaCompiler::SetHostToDeviceMetadata( const string& key, absl::Span types, absl::Span shapes) { - if (host_compute_recvs_.find(key) != host_compute_sends_.end()) { - return errors::InvalidArgument( - "Duplicate calls to SetHostToDeviceMetadata with key ", key); + if (host_compute_recvs_.find(key) != host_compute_recvs_.end()) { + tf2xla::HostTransferMetadata& existing_transfer = host_compute_recvs_[key]; + tf2xla::HostTransferMetadata new_transfer; + SetTransfer(key, types, shapes, &new_transfer); + if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + return Status::OK(); + } else { + return errors::InvalidArgument( + "Duplicate calls to SetHostToDeviceMetadata with key ", key); + } } tf2xla::HostTransferMetadata& transfer = host_compute_recvs_[key]; SetTransfer(key, types, shapes, &transfer); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index b0d93cde846..762700eaea8 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -129,8 +129,6 @@ class XlaCompiler { // Resource updates are converted into input / output of xla. The two // buffers are aliased with other if this option is true. - // - // Currently only supports TPU. bool alias_resource_update = false; }; diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 5df508d60b3..f348552050b 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -1897,5 +1897,63 @@ TEST_F(XlaCompilerTest, AliasResourceUpdates) { EXPECT_EQ(alias.entries(0).parameter_number(), 0); } +// Tests that passing in an exact duplicate input to SetDeviceToHostMeatadata +// is not an error. 
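The SetDeviceToHostMetadata / SetHostToDeviceMetadata change above (exercised by the tests that follow) makes repeated registration with an identical payload a no-op, while a mismatched payload for the same key remains an error. A toy version of that policy over a plain map, with the Status return simplified to bool:

    #include <map>
    #include <string>

    // Returns true if the key is newly registered or re-registered with the
    // exact same value; false models the InvalidArgument case for a
    // conflicting duplicate.
    bool RegisterOnce(std::map<std::string, std::string>& table,
                      const std::string& key, const std::string& value) {
      auto it = table.find(key);
      if (it != table.end()) return it->second == value;
      table.emplace(key, value);
      return true;
    }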
+TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); +} + +// Tests that passing in a mismatched duplicate input to +// SetDeviceToHostMeatadata is not an error. +TEST_F(XlaCompilerTest, SetDeviceToHostMetadataMismatchedDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + std::vector types2{DT_FLOAT}; + std::vector shapes2{TensorShape({1})}; + + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); + Status status = compiler.SetDeviceToHostMetadata(key, types2, shapes2); + EXPECT_EQ(status.code(), error::Code::INVALID_ARGUMENT); +} + +// Tests that passing in an exact duplicate input to SetHostToDeviceMeatadata +// is not an error. +TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); +} + +// Tests that passing in a mismatched duplicate input to +// SetHostToDeviceMeatadata is not an error. +TEST_F(XlaCompilerTest, SetHostToDeviceMetadataMismatchedDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + std::vector types2{DT_FLOAT}; + std::vector shapes2{TensorShape({1})}; + + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); + Status status = compiler.SetHostToDeviceMetadata(key, types2, shapes2); + EXPECT_EQ(status.code(), error::Code::INVALID_ARGUMENT); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc index 34e108bb6bf..f0cc8d26709 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.cc +++ b/tensorflow/compiler/tf2xla/xla_expression.cc @@ -101,6 +101,48 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const { }); } +xla::StatusOr XlaExpression::ResolveDynamism( + xla::Client* client) const { + switch (kind()) { + case Kind::kConstant: { + // Constant values are considered static. + Tensor constant_false(DT_BOOL, constant_value().shape()); + auto flat = constant_false.flat(); + for (int64 i = 0; i < flat.size(); ++i) flat(i) = false; + return constant_false; + } + case Kind::kXlaOp: + break; + case Kind::kTensorList: + TF_FALLTHROUGH_INTENDED; + case Kind::kResource: + TF_FALLTHROUGH_INTENDED; + case Kind::kInvalid: + return errors::InvalidArgument( + "ResolveDynamism called on unsupported XlaExpression: ", + HumanString()); + } + + if (!client) + return errors::InvalidArgument("client is required to resolve constant"); + + TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph, + handle().builder()->BuildDynamicInferenceGraph(handle())); + + TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape()); + + // The XLA layout is specified minor to major, and TensorFlow uses a major to + // minor order. 
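The layout comment above refers to the usual convention mismatch: TensorFlow host tensors are laid out major-to-minor (row-major), while an XLA layout lists dimensions minor-to-major, so a default row-major tensor of rank n maps to the layout {n-1, ..., 1, 0}. The reversed-iota construction, shown standalone:

    #include <numeric>
    #include <vector>

    // Row-major (major-to-minor) order expressed as an XLA-style
    // minor-to-major list, e.g. RowMajorMinorToMajor(3) == {2, 1, 0}.
    std::vector<int> RowMajorMinorToMajor(int rank) {
      std::vector<int> layout(rank);
      std::iota(layout.rbegin(), layout.rend(), 0);
      return layout;
    }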
+ std::vector layout_indices(shape.dims()); + std::iota(layout_indices.rbegin(), layout_indices.rend(), 0); + xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices); + TF_ASSIGN_OR_RETURN(xla::Literal literal, + client->ComputeConstant(constant_graph, &layout)); + Tensor tensor(DT_BOOL); + TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, DT_BOOL, &tensor)); + return tensor; +} + xla::StatusOr> XlaExpression::ResolveConstant( xla::Client* client, bool dynamic_dimension_is_minus_one) const { switch (kind()) { diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h index 3010964c5b7..3546368ff7b 100644 --- a/tensorflow/compiler/tf2xla/xla_expression.h +++ b/tensorflow/compiler/tf2xla/xla_expression.h @@ -99,6 +99,10 @@ class XlaExpression { xla::StatusOr> ResolveConstant( xla::Client* client, bool dynamic_dimension_is_minus_one = false) const; + // ResolveDynamism computes where a value inside this op is dynamic or can be + // inferred at compile time. + xla::StatusOr ResolveDynamism(xla::Client* client) const; + // Returns the shape of the tensor. // The shape of a resource is the shape of a resource handle (i.e., a scalar), // not the shape of the resource's value. diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 735a6c7291e..c2d1906e47a 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -243,6 +243,74 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) { return LiteralToFloat64Scalar(literal, out); } +static Status LiteralToPredVector(const xla::LiteralSlice& literal, + std::vector* out) { + if (literal.shape().rank() != 1) { + return errors::InvalidArgument("value is not 1D, rank: ", + literal.shape().rank()); + } + int64 size = xla::ShapeUtil::ElementsIn(literal.shape()); + if (literal.shape().element_type() != xla::PRED) { + return errors::InvalidArgument("value is not PRED"); + } + for (int64 i = 0; i < size; ++i) { + out->push_back(literal.Get({i})); + } + return Status::OK(); +} + +Status XlaOpKernelContext::ResolveInputDynamismIntoPred(int index, bool* out) { + xla::Literal literal; + XlaExpression e = InputExpression(index); + auto* client = compiler() ? compiler()->client() : nullptr; + xla::StatusOr dynamism_or_status = e.ResolveDynamism(client); + if (!dynamism_or_status.ok()) { + Status status = dynamism_or_status.status(); + errors::AppendToMessage(&status, "while evaluating input dynamism", index, + " of ", context_->op_kernel().type_string()); + return status; + } + Tensor dynamism = dynamism_or_status.ValueOrDie(); + + Tensor temp(dynamism.dtype()); + TensorShape tensor_shape({}); + if (!temp.CopyFrom(dynamism, tensor_shape)) { + return errors::InvalidArgument( + context_->op_kernel().name(), " input ", index, " has shape ", + dynamism.shape().DebugString(), " which is not a R0 ", tensor_shape); + } + + TF_ASSIGN_OR_RETURN(literal, HostTensorToLiteral(temp)); + *out = literal.Get({}); + return Status::OK(); +} + +Status XlaOpKernelContext::ResolveInputDynamismIntoPredVector( + int index, std::vector* out) { + xla::Literal literal; + XlaExpression e = InputExpression(index); + auto* client = compiler() ? 
compiler()->client() : nullptr; + xla::StatusOr dynamism_or_status = e.ResolveDynamism(client); + if (!dynamism_or_status.ok()) { + Status status = dynamism_or_status.status(); + errors::AppendToMessage(&status, "while evaluating input dynamism", index, + " of ", context_->op_kernel().type_string()); + return status; + } + Tensor dynamism = dynamism_or_status.ValueOrDie(); + + Tensor temp(dynamism.dtype()); + TensorShape tensor_shape({InputShape(index).num_elements()}); + if (!temp.CopyFrom(dynamism, tensor_shape)) { + return errors::InvalidArgument( + context_->op_kernel().name(), " input ", index, " has shape ", + dynamism.shape().DebugString(), " which is not a R1 ", tensor_shape); + } + + TF_ASSIGN_OR_RETURN(literal, HostTensorToLiteral(temp)); + return LiteralToPredVector(literal, out); +} + // Converts an int32 or int64 1D literal to an int64 vector. static Status LiteralToInt64Vector(const xla::LiteralSlice& literal, std::vector* out) { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 3cf51e6ec6f..1ed343ba20f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -116,7 +116,10 @@ class XlaOpKernelContext { // returns a one-element list. Status InputList(absl::string_view name, std::vector* handles, std::vector* shapes); - + // Evaluates input and returns their dynamism vector in a vector of + // predicates. + Status ResolveInputDynamismIntoPredVector(int index, std::vector* out); + Status ResolveInputDynamismIntoPred(int index, bool* out); // Helper methods for constant inputs. // Evaluates input `index` and stores it in `*constant_literal`. If the diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index e37f4659185..9948fe6d1b9 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -365,6 +365,19 @@ std::vector XlaOpRegistry::DeviceKernels( return ops; } +/*static*/ const std::unordered_set* +XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + auto it = registry.ops_.find(op); + static auto empty_set = new std::unordered_set; + if (it == registry.ops_.end() || it->second.empty()) { + return empty_set; + } else { + return &it->second.front()->compile_time_constant_inputs; + } +} + /* static */ Status XlaOpRegistry::CompileTimeConstantInputs( const NodeDef& node_def, const OpKernel* op_kernel, const OpDef* op_def, std::vector* result) { @@ -385,21 +398,10 @@ std::vector XlaOpRegistry::DeviceKernels( compile_time_constant_inputs_from_attr.end())); compile_time_constant_inputs = &compile_time_constant_inputs_from_attr; } else { - const string& op = node_def.op(); - - XlaOpRegistry& registry = Instance(); - mutex_lock lock(registry.mutex_); - auto it = registry.ops_.find(op); - if (it == registry.ops_.end() || it->second.empty()) { + compile_time_constant_inputs = + CompileTimeConstantInputArgNames(node_def.op()); + if (compile_time_constant_inputs->empty()) { return Status::OK(); - } else { - // The test in IsCompatible ensures that if there are multiple matching - // registrations for this op name, they all have the same value of - // compile_time_constant_inputs, so only the first match is returned. - // - // TODO(sanjoy): This can probably be a std::vector. 
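CompileTimeConstantInputArgNames above always hands back a usable pointer: a registered op returns its set of constant-input argument names, and an unknown op returns a function-local static empty set rather than null or a dangling temporary. The lookup pattern in isolation (names here are illustrative):

    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    using StringSet = std::unordered_set<std::string>;

    // Never returns null: misses resolve to a shared, immutable empty set
    // that lives for the duration of the program.
    const StringSet* LookupOrEmpty(
        const std::unordered_map<std::string, StringSet>& table,
        const std::string& key) {
      static const StringSet* const kEmpty = new StringSet;
      auto it = table.find(key);
      return it == table.end() ? kEmpty : &it->second;
    }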
- compile_time_constant_inputs = - &it->second.front()->compile_time_constant_inputs; } } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index af720fb4bb9..9533acb6a0c 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -198,6 +198,11 @@ class XlaOpRegistry { /*op_def=*/nullptr, result); } + // Return names of arguments for a given op which are supposed to be + // constants. + static const std::unordered_set* + CompileTimeConstantInputArgNames(const string& op); + // Returns true if `op` is a "metadata" op, one that only looks at the shapes // of its operands and not their values. static bool IsMetadataOp(const string& op); diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 35fa6a617f0..598112e00df 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -969,6 +969,11 @@ tf_cc_test( ], ) +cc_library( + name = "union_find", + hdrs = ["union_find.h"], +) + # ----------------------------------------------------------------------------- # This is a headers target that extra XLA devices can use to prevent circular dependencies. Devices that are compiled as separate shared objects can also use it to prevent linking of library code. diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index 392cd9bd359..a85d551769c 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -289,13 +289,19 @@ class Array { } // Fills the array with random normal variables with the specified mean. - void FillRandom(const T& stddev, const double mean = 0.0, - const int seed = 12345) { + void FillRandom(const T& stddev, double mean = 0.0, int seed = 12345) { + FillRandomDouble(static_cast(stddev), mean, seed); + } + + void FillRandomDouble(double stddev, double mean = 0.0, int seed = 12345) { std::mt19937 g(seed); - std::normal_distribution distribution(mean, - static_cast(stddev)); + std::normal_distribution distribution(mean, stddev); for (int64 i = 0; i < num_elements(); ++i) { - values_[i] = static_cast(distribution(g)); + if (std::is_same()) { + values_[i] = static_cast(distribution(g) > 0.0); + } else { + values_[i] = static_cast(distribution(g)); + } } } diff --git a/tensorflow/compiler/xla/bit_cast.h b/tensorflow/compiler/xla/bit_cast.h index 90e9a5c25dd..feb548c9433 100644 --- a/tensorflow/compiler/xla/bit_cast.h +++ b/tensorflow/compiler/xla/bit_cast.h @@ -29,7 +29,7 @@ limitations under the License. 
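The Array::FillRandom change above adds a bool path: samples are drawn from a double-precision normal distribution and thresholded at zero instead of being cast directly (a direct cast to bool would make almost every entry true). A plain std:: sketch of that behaviour outside the xla::Array class:

    #include <cstdint>
    #include <random>
    #include <vector>

    // Draw n normal samples and keep only their sign: roughly half the
    // entries come out true when mean == 0.
    std::vector<bool> FillRandomBool(int64_t n, double stddev,
                                     double mean = 0.0, int seed = 12345) {
      std::mt19937 g(seed);
      std::normal_distribution<double> distribution(mean, stddev);
      std::vector<bool> values(static_cast<size_t>(n));
      for (int64_t i = 0; i < n; ++i) {
        values[static_cast<size_t>(i)] = distribution(g) > 0.0;
      }
      return values;
    }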
#include "absl/base/casts.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/types.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 06fd8ceeb2b..6cd77bf9f19 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -55,9 +55,13 @@ xla_test( cc_library( name = "comparators", srcs = ["comparators.cc"], - hdrs = ["comparators.h"], + hdrs = [ + "comparators.h", + "//tensorflow/compiler/xla:literal_util", + ], deps = [ ":constants", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -195,6 +199,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) @@ -300,6 +305,20 @@ xla_test( "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "//tensorflow/core/platform:tensor_float_32_utils", + ], +) + +cc_library( + name = "lu_decomposition", + srcs = ["lu_decomposition.cc"], + hdrs = ["lu_decomposition.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", ], ) @@ -340,6 +359,9 @@ cc_library( hdrs = ["sorting.h"], deps = [ ":comparators", + ":constants", + ":loops", + ":slicing", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -571,6 +593,7 @@ cc_library( ":loops", ":math", ":matrix", + ":qr", ":slicing", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 20d9930341f..744cdcea14c 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -137,7 +137,7 @@ XlaComputation CreateMinMaxComputation(XlaBuilder* outer_builder, arg_max = Select(eq, tie_id, arg_max); } Tuple(b, {max, arg_max}); - return b->Build().ConsumeValueOrDie(); + return b->BuildAndNoteError(); } XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min, diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc index 74e89b767cf..c9d6cea740d 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.cc +++ b/tensorflow/compiler/xla/client/lib/comparators.cc @@ -32,85 +32,13 @@ limitations under the License. 
namespace xla { namespace { -using XlaOpGenerator = XlaOp (*)(XlaOp, XlaOp, absl::Span); - -XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value, - int64 bit_width) { - PrimitiveType signed_type; - PrimitiveType unsigned_type; - XlaOp max_value; - switch (bit_width) { - case 16: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S16; - unsigned_type = U16; - break; - case 32: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S32; - unsigned_type = U32; - break; - case 64: - max_value = - ConstantR0(value.builder(), - static_cast(std::numeric_limits::max())); - signed_type = S64; - unsigned_type = U64; - break; - default: - return value.builder()->ReportError( - InvalidArgument("Invalid bit width %lld for Comparator floating " - "point parameter.", - bit_width)); - } - // Switch from a floating point value to a integer value in such a way that - // when using the integer value to compare, we get the same result for normal - // values, and -Nan is treated as the smallest value, and Nan is treated as - // the largest value. - // If f is a float, and - // x = bit_cast(f); - // y = x < 0 ? numeric_limits::max() - x : x; - // then y is ordered as an int32 such that finite values have the obvious - // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning - // and end of the ordering. - // Note that in order to avoid -x to overflow, we calculate - // numeric_limits::max() - x as unsigned, and then convert back to - // signed. - auto signed_value = BitcastConvertType(value, signed_type); - auto unsigned_value = BitcastConvertType(value, unsigned_type); - auto flipped_value = - BitcastConvertType(Sub(max_value, unsigned_value), signed_type); - auto is_negative = Lt(signed_value, Zero(value.builder(), signed_type)); - return Select(is_negative, flipped_value, signed_value); -} - -void ConvertFloatingPoint(const PrimitiveType& operand_type, XlaOp* lhs_param, - XlaOp* rhs_param) { - if (primitive_util::IsFloatingPointType(operand_type)) { - PrimitiveType compare_type = operand_type; - // Special-case handling for BF16. We currently do not support direct - // comparisons with BF16, so we convert to F32 and then use the F32 - // comparison logic. 
- if (compare_type == BF16) { - compare_type = F32; - *lhs_param = ConvertElementType(*lhs_param, F32); - *rhs_param = ConvertElementType(*rhs_param, F32); - } - int64 bit_width = primitive_util::BitWidth(compare_type); - *lhs_param = BitcastConvertFloatingPointToIntegral(*lhs_param, bit_width); - *rhs_param = BitcastConvertFloatingPointToIntegral(*rhs_param, bit_width); - } -} +using XlaCompareOp = XlaOp (*)(XlaOp, XlaOp, absl::Span); XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, - XlaBuilder* builder, XlaOpGenerator generator) { + XlaBuilder* builder, XlaCompareOp generator) { CHECK_NE(operand_types.size(), 0); - std::vector> generators(operand_types.size()); + std::vector> generators(operand_types.size()); generators[0] = generator; return CreateScalarComparisonComputation(name, operand_types, generators, builder); @@ -119,7 +47,7 @@ XlaComputation CreateScalarComparisonComputation( XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, - const std::vector>& generators, + const std::vector>& generators, XlaBuilder* builder) { // Create a default computation where we compare only the first two // parameters of type 'operand_types[0]'. @@ -146,7 +74,6 @@ XlaComputation CreateScalarComparisonComputation( absl::StrCat("p.", parameter_count, ".lhs")); auto rhs_param = Parameter(b.get(), parameter_count * 2 + 1, scalar_shape, absl::StrCat("p.", parameter_count, ".rhs")); - ConvertFloatingPoint(operand_type, &lhs_param, &rhs_param); lhs_params.emplace_back(lhs_param); rhs_params.emplace_back(rhs_param); if (generators[parameter_count].has_value()) { @@ -157,7 +84,12 @@ XlaComputation CreateScalarComparisonComputation( CHECK_NE(parameter_count, 0); - Shape shape = b->GetShape(lhs_params[0]).ValueOrDie(); + auto shape_or = b->GetShape(lhs_params[0]); + if (!shape_or.ok()) { + b->ReportError(shape_or.status()); + return {}; + } + Shape shape = shape_or.ValueOrDie(); shape.set_element_type(PRED); XlaOp param_equal = Broadcast(One(b.get(), shape.element_type()), AsInt64Slice(shape.dimensions())); @@ -169,7 +101,8 @@ XlaComputation CreateScalarComparisonComputation( generators[i].value()(lhs_params[i], rhs_params[i], {}), result); if (i != last_generator_index) { - param_equal = And(param_equal, Eq(lhs_params[i], rhs_params[i])); + param_equal = + And(param_equal, EqTotalOrder(lhs_params[i], rhs_params[i])); } } } @@ -181,14 +114,14 @@ XlaComputation CreateScalarComparisonComputation( XlaComputation CreateScalarLtComputation( const std::vector& operand_types, XlaBuilder* builder) { return CreateScalarComparisonComputation("compare-less-than", operand_types, - builder, Lt); + builder, LtTotalOrder); } // Creates a scalar greater-than computation and returns it. 
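The comparator rewrite above drops the hand-rolled BitcastConvertFloatingPointToIntegral path in favour of the *TotalOrder comparison ops, but the removed comment still describes the underlying trick: bit-cast the float to an integer and flip negative values so that ordinary signed comparison yields a total order with -NaN first and NaN last. A host-side sketch of that mapping for 32-bit floats:

    #include <cstdint>
    #include <cstring>
    #include <limits>

    // Monotone float -> int32 key: finite values keep their order, -0 sorts
    // before +0, and -NaN / NaN land at the two extremes. The subtraction is
    // done in unsigned arithmetic to avoid signed overflow.
    int32_t TotalOrderKey(float f) {
      int32_t s;
      uint32_t u;
      std::memcpy(&s, &f, sizeof(f));
      std::memcpy(&u, &f, sizeof(f));
      if (s < 0) {
        u = static_cast<uint32_t>(std::numeric_limits<int32_t>::max()) - u;
        std::memcpy(&s, &u, sizeof(u));
      }
      return s;
    }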
XlaComputation CreateScalarGtComputation( const std::vector& operand_types, XlaBuilder* builder) { - return CreateScalarComparisonComputation("compare-greater-than", - operand_types, builder, Gt); + return CreateScalarComparisonComputation( + "compare-greater-than", operand_types, builder, GtTotalOrder); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/comparators.h b/tensorflow/compiler/xla/client/lib/comparators.h index 25924d4a4f4..a82a84799aa 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.h +++ b/tensorflow/compiler/xla/client/lib/comparators.h @@ -43,14 +43,13 @@ XlaComputation CreateScalarGtComputation( const std::vector& operand_types, XlaBuilder* builder); // Creates a scalar comparison computation and returns it. This function takes -// an std::vector> and compare the operands -// where the generator isn't nullopt with the specified comparator -// at that location. +// a vector of comparator functions to compare the operands where the function +// isn't nullopt with the specified comparator at that location. XlaComputation CreateScalarComparisonComputation( const string& name, const std::vector& operand_types, const std::vector< absl::optional)>>& - generators, + comparators, XlaBuilder* builder); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/logdet.cc b/tensorflow/compiler/xla/client/lib/logdet.cc index 8f37c393922..18cd0870f2a 100644 --- a/tensorflow/compiler/xla/client/lib/logdet.cc +++ b/tensorflow/compiler/xla/client/lib/logdet.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/loops.h" #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/qr.h" #include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -33,13 +34,46 @@ limitations under the License. namespace xla { -// let G = root(A) be the Cholesky root of the matrix A -// log(det(A)) = 2*sum(log(vecdiag(G))) +// log(det(A)) = sum(log(vecdiag(QR(A).r))), since R is triangular and Q is +// orthonormal XlaOp LogDet(XlaOp a) { - XlaOp cholesky = Cholesky(a, /*bool lower=*/true); + return a.builder()->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape a_shape, a.builder()->GetShape(a)); + // Compute the number of Householder transformations required on 'a' by + // determining the number of rows in 'a' that are already triangular. 
The + // determinant of Q is -1 ^ (number of Householder transformations) + auto rows = Iota(a.builder(), ShapeUtil::ChangeElementType(a_shape, S32), + a_shape.rank() - 2); + auto cols = Iota(a.builder(), ShapeUtil::ChangeElementType(a_shape, S32), + a_shape.rank() - 1); + auto in_lower_triangle = Lt(cols, rows); + auto is_zero = Eq(a, ScalarLike(a, 0)); + auto num_zeros_in_triangle_per_row = Einsum( + ConvertElementType(And(in_lower_triangle, is_zero), S32), "...a->..."); + TF_ASSIGN_OR_RETURN(auto row_shape, + a.builder()->GetShape(num_zeros_in_triangle_per_row)); + rows = Iota(a.builder(), row_shape, row_shape.rank() - 1); + auto num_triangle_rows = + Einsum(ConvertElementType(Eq(rows, num_zeros_in_triangle_per_row), S32), + "...a->..."); + auto num_rows = + ScalarLike(num_triangle_rows, a_shape.dimensions(a_shape.rank() - 2)); - return ScalarLike(a, 2) * - Einsum(Log(cholesky), "...aa->...", xla::PrecisionConfig::HIGHEST); + TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, true)); + // Get the sign and log of the determinant based on the values along the diagonal + // of R. + auto log_abs_det = Einsum(Log(Abs(qr.r)), "...aa->..."); + auto sign_diag = Reduce( + Sign(Einsum(qr.r, "...aa->...a")), + One(a.builder(), a_shape.element_type()), + CreateScalarMultiplyComputation(a_shape.element_type(), a.builder()), + {a_shape.rank() - 2}); + return sign_diag * log_abs_det * + Select(ConvertElementType(Rem(num_rows - num_triangle_rows, + ScalarLike(num_triangle_rows, 2)), + PRED), + ScalarLike(sign_diag, -1.0), ScalarLike(sign_diag, 1.0)); + }); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/logdet_test.cc b/tensorflow/compiler/xla/client/lib/logdet_test.cc index 54af41f77f6..319d819ed98 100644 --- a/tensorflow/compiler/xla/client/lib/logdet_test.cc +++ b/tensorflow/compiler/xla/client/lib/logdet_test.cc @@ -51,6 +51,26 @@ XLA_TEST_F(LogDetTest, Simple) { xla::ErrorSpec(1e-4)); } +XLA_TEST_F(LogDetTest, SimpleTriangle) { + xla::XlaBuilder builder(TestName()); + + xla::Array2D a_vals({ + {4, 6, 8, 10}, + {4, -39, 62, 73}, + {0, 0, -146, 166}, + {4, 6, 8, 320}, + }); + + float expected = -15.9131355f; + + xla::XlaOp a; + auto a_data = CreateR2Parameter(a_vals, 0, "a", &builder, &a); + xla::LogDet(a); + + ComputeAndCompareR0(&builder, expected, {a_data.get()}, + xla::ErrorSpec(1e-4)); +} + XLA_TEST_F(LogDetTest, SimpleBatched) { xla::XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/client/lib/lu_decomposition.cc b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc new file mode 100644 index 00000000000..2920b6f56b5 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
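Since det(A) = det(Q) * det(R), log|det(A)| is the sum of log|r_ii|, det(R)'s sign is the product of sign(r_ii), and det(Q) is +/-1; the code above recovers det(Q)'s sign from the parity of Householder reflections by counting rows that are already triangular. A NumPy sanity check of that decomposition, using the matrix from the SimpleTriangle test above (illustrative only; it reads det(Q) off Q directly instead of counting reflections):

```python
import numpy as np

# sign(det) and log|det| from QR, cross-checked with np.linalg.slogdet.
def slogdet_via_qr(a):
    q, r = np.linalg.qr(a)
    diag = np.diagonal(r)
    log_abs_det = np.sum(np.log(np.abs(diag)))
    sign = np.prod(np.sign(diag)) * np.round(np.linalg.det(q))  # det(Q) is +/-1
    return sign, log_abs_det

a = np.array([[4., 6., 8., 10.],
              [4., -39., 62., 73.],
              [0., 0., -146., 166.],
              [4., 6., 8., 320.]])
sign, log_abs = slogdet_via_qr(a)
print(sign * log_abs)          # ~ -15.9131..., the value SimpleTriangle expects
print(np.linalg.slogdet(a))    # same (sign, log|det|) pair
```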
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/lu_decomposition.h" + +#include + +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +LuDecompositionResult LuDecomposition(XlaOp a) { + XlaBuilder* builder = a.builder(); + XlaOp result = builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int ndims = a_shape.rank(); + TF_RET_CHECK(ndims >= 2); + const int64 m = ShapeUtil::GetDimension(a_shape, -2); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + const int num_batch_dims = a_shape.dimensions().size() - 2; + const std::vector batch_dims( + a_shape.dimensions().begin(), + a_shape.dimensions().begin() + num_batch_dims); + + std::vector pivot_dims = batch_dims; + pivot_dims.push_back(std::min(m, n)); + std::vector perm_dims = batch_dims; + perm_dims.push_back(m); + Shape lu_shape = ShapeUtil::MakeTupleShape( + {a_shape, ShapeUtil::MakeShape(S32, pivot_dims), + ShapeUtil::MakeShape(S32, perm_dims)}); + // The TPU compiler has a rewrite pass that lowers an LuDecomposition + // CustomCall. + // TODO(phawkins): upgrade LU decomposition to a first-class HLO operator + // and implement it on other backends. + return CustomCall(a.builder(), "LuDecomposition", {a}, lu_shape); + }); + return LuDecompositionResult{GetTupleElement(result, 0), + GetTupleElement(result, 1), + GetTupleElement(result, 2)}; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/lu_decomposition.h b/tensorflow/compiler/xla/client/lib/lu_decomposition.h new file mode 100644 index 00000000000..3f5703510a3 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/lu_decomposition.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Computes the LU decomposition with partial pivoting of a batch of matrices. +// +// Given a (batched) matrix a with shape [..., m, n], computes the matrix +// decomposition A = P @ L @ U where P is a permutation matrix, L is a +// lower-triangular matrix with unit diagonal entries, and U is an +// upper-triangular matrix. +// +// L and U are returned as a single matrix [..., m, n] containing both L and U +// packed in the same array. The unit diagonal of L is not represented +// explicitly. 
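The result tuple built above carries `pivots` (shape batch + [min(m, n)]) and `permutation` (shape batch + [m]); the header that follows explains why both are returned. A hedged NumPy sketch of how the two relate, assuming the usual 0-based LAPACK xGETRF convention that pivots[i] names the row swapped with row i (the numbers below are made up for illustration):

```python
import numpy as np

# Convert LAPACK-style pivots into an explicit row permutation: after applying
# the swaps in order, row i of P @ A is row perm[i] of A.
def pivots_to_permutation(pivots, m):
    perm = np.arange(m)
    for i, p in enumerate(pivots):
        perm[i], perm[p] = perm[p], perm[i]
    return perm

pivots = np.array([2, 2, 3, 3])                  # hypothetical swap sequence, m = 4
print(pivots_to_permutation(pivots, 4))          # [2 0 3 1]
# `pivots` is handy for determinants: det(P) = (-1) ** (number of real swaps).
print((-1.0) ** np.sum(pivots != np.arange(4)))  # -1.0
```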
+// +// The permutation matrix P is returned in two forms, both as `pivots`, which is +// an s32[..., min(m, n)] array that describes a sequence of row-swaps in the +// style of LAPACK's xGETRF API, and `permutation`, which is an s32[..., m] array +// which gives the permutation to apply to the rows. We return both +// representations because they are each useful for different purposes; `pivots` +// is useful for computing the sign of a determinant, whereas `permutation` can +// be used via a Gather operation to permute the rows of a matrix. +// +// This method is only implemented on TPU at the moment. +// TODO(b/168208200): the implementation only supports F32 arrays. Handle the +// complex case. +struct LuDecompositionResult { + // The LU decomposition, with both L and U packed into an array with shape + // [..., m, n]. + XlaOp lu; + // An array of shape s32[..., min(m, n)] containing the pivot rows. + XlaOp pivots; + // An array of shape s32[..., m], containing another representation of the + // pivots as a permutation. + XlaOp permutation; +}; + +LuDecompositionResult LuDecomposition(XlaOp a); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 6fdaab58686..cd9f88a74ce 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -1111,11 +1111,28 @@ XlaOp RoundToEven(XlaOp x) { // acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 // pi if x == -1 +// For complex: +// acos(x) = -(i * log(x + i * sqrt((1 + x) * (1 - x)))) XlaOp Acos(XlaOp x) { - return Select(Ne(x, FullLike(x, -1)), - ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), - ScalarLike(x, 1.0) + x), - FullLike(x, M_PI)); + XlaBuilder* b = x.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x)); + + if (primitive_util::IsComplexType(shape.element_type())) { + auto one = ScalarLike(x, 1); + auto imag_one = Complex( + Zero(b, primitive_util::ComplexComponentType(shape.element_type())), + One(b, primitive_util::ComplexComponentType(shape.element_type()))); + + auto result = + Neg(imag_one * Log(x + imag_one * Sqrt((one + x) * (one - x)))); + return result; + } + return Select(Ne(x, FullLike(x, -1)), + ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), + ScalarLike(x, 1.0) + x), + FullLike(x, M_PI)); + }); } // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index cb79b2ef7db..ae4d839d8fa 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -660,5 +660,19 @@ XLA_TEST_F(MathTest, BesselI1eDouble) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(MathTest, AcosComplexValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1>( + &builder, {{0, 0}, {0, 1}, {1, 1}, {0.8, 0.2}}); + + Acos(x); + std::vector> expected = { + {1.5707963267948966, 0}, + {1.5707963267948966, -0.881373587019543}, + {0.9045568943023814, -1.0612750619050357}, + {0.7011246914497526, -0.30527648462436596}}; + ComputeAndCompareR1>(&builder, expected, {}, error_spec_); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc index b7721f2bbc5..dbb73602801 100644 ---
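The complex branch added to Acos above uses the principal-value identity acos(x) = -i * log(x + i * sqrt((1 + x) * (1 - x))). It can be spot-checked in NumPy against the points used in the new AcosComplexValues test (an illustration, not the XLA lowering):

```python
import numpy as np

x = np.array([0 + 0j, 0 + 1j, 1 + 1j, 0.8 + 0.2j])
via_log = -1j * np.log(x + 1j * np.sqrt((1 + x) * (1 - x)))
print(via_log)       # ~[1.5708+0j, 1.5708-0.8814j, 0.9046-1.0613j, 0.7011-0.3053j]
print(np.arccos(x))  # matches to floating-point precision
```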
a/tensorflow/compiler/xla/client/lib/matrix.cc +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -30,6 +30,7 @@ limitations under the License. #include "absl/strings/str_join.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" @@ -235,85 +236,93 @@ XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); } XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); } namespace { -std::vector EinsumDiagonalLabels(absl::Span config) { +absl::optional, 3>> EinsumDiagonalLabels( + absl::Span config) { std::vector unique_labels; + std::vector reduce_dims; + std::vector broadcast_dims; for (auto label = config.begin(); label != config.end(); ++label) { auto first_label = absl::c_find(config, *label); + auto dim = label - config.begin(); if (first_label == label) { unique_labels.push_back(*label); + broadcast_dims.push_back(dim); + } else { + reduce_dims.push_back(dim); } } if (unique_labels.size() == config.size()) { - unique_labels.clear(); + return absl::nullopt; } - return unique_labels; + return {{unique_labels, reduce_dims, broadcast_dims}}; } -} // namespace -xla::XlaOp EinsumDiagonal(XlaOp x, absl::Span config) { +// Masks a tensor such that only the diagonal of repeated indices are non-zero. +// The result of this can be used to create a diagonal matrix with an identity +// reduction. +xla::XlaOp EinsumDiagonalMask(XlaOp x, absl::Span config) { XlaBuilder* builder = x.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { - if (EinsumDiagonalLabels(config).empty()) { - return x; - } TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); Shape iota_shape = x_shape; iota_shape.set_element_type(S32); XlaOp mask = ConstantR0(builder, true); - absl::InlinedVector reduce_dims; for (auto label = config.begin(); label != config.end(); ++label) { const int64 dim = label - config.begin(); auto first_label = absl::c_find(config, *label); - if (first_label == label) { - continue; + if (first_label != label) { + const int64 first_dim = first_label - config.begin(); + mask = And(mask, Eq(Iota(builder, iota_shape, first_dim), + Iota(builder, iota_shape, dim))); } - reduce_dims.push_back(dim); - const int64 first_dim = first_label - config.begin(); - mask = And(mask, Eq(Iota(builder, iota_shape, first_dim), - Iota(builder, iota_shape, dim))); } - auto zero = ScalarLike(x, 0); - return Reduce(Select(mask, x, zero), zero, - CreateScalarIdentityWithZeroComputation( - x_shape.element_type(), builder), - reduce_dims); + return Select(mask, x, ZerosLike(x)); }); } -Status ValidateEinsumNumericDimensions(absl::Span x_config, - absl::Span y_config, - absl::Span output_config) { - for (auto dim : output_config) { - if (absl::c_linear_search(x_config, dim) || - absl::c_linear_search(y_config, dim)) { - if (absl::c_count(output_config, dim) > 1) { - return InvalidArgument("Einsum has repeated output dimension."); - } - continue; +xla::XlaOp EinsumDiagonal(XlaOp x, absl::Span config) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + auto labels = EinsumDiagonalLabels(config); + if (!labels) { + return x; } - return InvalidArgument( - "Einsum has output dimension without corresponding input dimension."); - } - for (auto dim : x_config) { - if (absl::c_linear_search(y_config, dim) || - absl::c_linear_search(output_config, dim)) { - if 
(absl::c_count(x_config, dim) > 1) { - return InvalidArgument("Einsum has repeated lhs dimension."); - } - } - } - for (auto dim : y_config) { - if (absl::c_linear_search(x_config, dim) || - absl::c_linear_search(output_config, dim)) { - if (absl::c_count(y_config, dim) > 1) { - return InvalidArgument("Einsum has repeated rhs dimension."); - } - } - } - return Status::OK(); + auto zero = ScalarLike(x, 0); + TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); + return Reduce(EinsumDiagonalMask(x, config), zero, + CreateScalarIdentityWithZeroComputation( + x_shape.element_type(), builder), + labels->at(1)); + }); } +xla::XlaOp EinsumInverseDiagonal(XlaOp x, absl::Span config) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + auto labels = EinsumDiagonalLabels(config); + if (!labels) { + return x; + } + TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); + std::vector broadcast_sizes; + int64 x_dim = 0; + for (auto label = config.begin(); label != config.end(); ++label) { + auto first_label = absl::c_find(config, *label); + if (first_label == label) { + broadcast_sizes.push_back(x_shape.dimensions(x_dim)); + ++x_dim; + } else { + broadcast_sizes.push_back( + broadcast_sizes[first_label - config.begin()]); + } + } + x = BroadcastInDim(x, broadcast_sizes, labels->at(2)); + return EinsumDiagonalMask(x, config); + }); +} +} // namespace + namespace { // Helper method to remove dimensions from a shape and dot dimension numbers // used to implement implicit broadcasting. @@ -347,21 +356,23 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, XlaBuilder* builder = x.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { auto x_diagonal_labels = EinsumDiagonalLabels(x_config); + if (x_diagonal_labels) { + return Einsum(EinsumDiagonal(x, x_config), x_diagonal_labels->at(0), y, + y_config, output_config, precision); + } auto y_diagonal_labels = EinsumDiagonalLabels(y_config); - if (!x_diagonal_labels.empty() && !y_diagonal_labels.empty()) { - return Einsum(EinsumDiagonal(x, x_config), x_diagonal_labels, - EinsumDiagonal(y, y_config), y_diagonal_labels, - output_config, precision); - } else if (!x_diagonal_labels.empty()) { - return Einsum(EinsumDiagonal(x, x_config), x_diagonal_labels, y, y_config, - output_config, precision); - } else if (!y_diagonal_labels.empty()) { - return Einsum(x, x_config, EinsumDiagonal(y, y_config), y_diagonal_labels, - output_config, precision); + if (y_diagonal_labels) { + return Einsum(x, x_config, EinsumDiagonal(y, y_config), + y_diagonal_labels->at(0), output_config, precision); + } + auto output_diagonal_labels = EinsumDiagonalLabels(output_config); + if (output_diagonal_labels) { + return EinsumInverseDiagonal( + Einsum(x, x_config, y, y_config, output_diagonal_labels->at(0), + precision), + output_config); } - TF_RETURN_IF_ERROR( - ValidateEinsumNumericDimensions(x_config, y_config, output_config)); TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y)); const int64 x_rank = x_config.size(); @@ -372,41 +383,37 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, absl::flat_hash_set output_map; for (auto d : x_config) { - if (!x_map.insert(d).second) { - return InvalidArgument("XLA Einsum does not support rhs tracing"); - } + x_map.insert(d); } for (auto d : y_config) { - if (!y_map.insert(d).second) { - return InvalidArgument("XLA Einsum does not support lhs tracing"); - } + y_map.insert(d); } for (auto 
d : output_config) { - if (!output_map.insert(d).second) { - return InvalidArgument("XLA Einsum does not support output tracing"); - } + output_map.insert(d); } DotDimensionNumbers dnums; - std::vector lhs_outer_dims; auto is_batch_dim = [&](int64 d) { return x_map.contains(d) && y_map.contains(d) && output_map.contains(d); }; auto is_contracting = [&](int64 d) { return x_map.contains(d) && y_map.contains(d); }; + auto rhs_dimension_number = [&](int64 d) { return absl::c_find(y_config, d) - y_config.begin(); }; absl::InlinedVector rhs_outer_dims; + absl::InlinedVector lhs_outer_dims; absl::InlinedVector rhs_delete_dims; absl::InlinedVector lhs_delete_dims; for (int64 i = 0; i < x_rank; ++i) { auto dim_name = x_config[i]; const int64 rhs_dim = rhs_dimension_number(dim_name); + if (is_batch_dim(dim_name)) { if (x_shape.dimensions(i) == y_shape.dimensions(rhs_dim)) { dnums.add_lhs_batch_dimensions(i); @@ -442,63 +449,90 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, } absl::c_sort(rhs_outer_dims); - absl::InlinedVector output_transpose_dims; - absl::InlinedVector output_reduce_dims; - auto output_dimension_number = [&](int64 d) { + + auto output_dimension_number = [&](int64 d) -> absl::optional { auto pos = absl::c_find(output_config, d); if (pos == output_config.end()) { - const int64 dim = - output_transpose_dims.size() + output_reduce_dims.size(); - output_reduce_dims.push_back(dim); - } else { - output_transpose_dims.push_back(pos - output_config.begin()); + return absl::nullopt; } + return pos - output_config.begin(); }; for (auto d : dnums.lhs_batch_dimensions()) { - output_dimension_number(x_config[d]); + output_transpose_dims.push_back(*output_dimension_number(x_config[d])); } for (auto d : lhs_outer_dims) { - output_dimension_number(x_config[d]); + if (auto output_dim = output_dimension_number(x_config[d])) { + output_transpose_dims.push_back(*output_dim); + continue; + } + lhs_delete_dims.push_back(d); } for (auto d : rhs_outer_dims) { - output_dimension_number(y_config[d]); + if (auto output_dim = output_dimension_number(y_config[d])) { + output_transpose_dims.push_back(*output_dim); + continue; + } + rhs_delete_dims.push_back(d); } + const int64 transpose_rank = output_transpose_dims.size(); std::vector transpose_dims(output_rank); - for (int64 i = 0; i < output_rank; ++i) { + for (int64 i = 0; i < transpose_rank; ++i) { transpose_dims[output_transpose_dims[i]] = i; } // Remove ones that where broadcasted from the x and the y shape and adjust // the dimension numbers that are more minor than those dimensions. 
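The reworked Einsum path above handles repeated labels by zeroing everything off the generalized diagonal (EinsumDiagonalMask), reducing the duplicate dimensions (EinsumDiagonal), and, for repeated output labels, broadcasting back and re-masking (EinsumInverseDiagonal). A NumPy sketch of the idea for a single repeated label (illustrative, not the actual lowering):

```python
import numpy as np

x = np.arange(16.0).reshape(4, 4)
mask = np.equal.outer(np.arange(4), np.arange(4))   # Iota(dim 0) == Iota(dim 1)

# 'aa->a': mask off-diagonal entries to zero, then reduce the duplicate dim.
diag = np.where(mask, x, 0.0).sum(axis=1)
print(np.allclose(diag, np.einsum('aa->a', x)))     # True

# Inverse direction ('a->aa'): broadcast along the repeated dim, re-apply mask.
back = np.where(mask, np.broadcast_to(diag[:, None], (4, 4)), 0.0)
print(np.allclose(back, np.diag(diag)))             # True
```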
+ absl::c_sort(lhs_delete_dims); DeleteDimsFromContainer(lhs_delete_dims, &x_shape, dnums.mutable_lhs_batch_dimensions(), dnums.mutable_lhs_contracting_dimensions()); + + absl::c_sort(rhs_delete_dims); DeleteDimsFromContainer(rhs_delete_dims, &y_shape, dnums.mutable_rhs_batch_dimensions(), dnums.mutable_rhs_contracting_dimensions()); if (!lhs_delete_dims.empty()) { - x = Reshape(x, x_shape.dimensions()); + x = Reduce(x, ScalarLike(x, 0), + CreateScalarAddComputation(x_shape.element_type(), builder), + lhs_delete_dims); } if (!rhs_delete_dims.empty()) { - y = Reshape(y, y_shape.dimensions()); + y = Reduce(y, ScalarLike(y, 0), + CreateScalarAddComputation(y_shape.element_type(), builder), + rhs_delete_dims); } PrecisionConfig precision_proto; precision_proto.add_operand_precision(precision); precision_proto.add_operand_precision(precision); auto dot = DotGeneral(x, y, dnums, &precision_proto); - if (!output_reduce_dims.empty()) { - dot = Reduce(dot, ScalarLike(dot, 0), - CreateScalarAddComputation(x_shape.element_type(), builder), - output_reduce_dims); + dot = Transpose(dot, transpose_dims); + if (transpose_rank == output_rank) { + return dot; } - return Transpose(dot, transpose_dims); + + auto is_output_only = [&](int64 d) { + return output_map.contains(d) && !x_map.contains(d) && !y_map.contains(d); + }; + + int64 dot_dim = 0; + std::vector new_dims; + new_dims.reserve(output_rank); + TF_ASSIGN_OR_RETURN(Shape dot_shape, builder->GetShape(dot)); + for (auto d : output_config) { + if (is_output_only(d)) { + new_dims.push_back(1); + } else { + new_dims.push_back(dot_shape.dimensions(dot_dim)); + } + } + return Reshape(dot, new_dims); }); } diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h index 46f70ed27b9..1a9f72dedf2 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.h +++ b/tensorflow/compiler/xla/client/lib/matrix.h @@ -112,14 +112,6 @@ StatusOr, 3>> ParseEinsumString( // Returns an empty string if the einsum string already has an ->. std::string NormalizeEinsumString(absl::string_view einsum_config); -// Determine if each dimension label is in at least two inputs. -// -// NOTE: This function is meant for testing, there is no need to call it -// directly. -Status ValidateEinsumNumericDimensions(absl::Span x_config, - absl::Span y_config, - absl::Span output_config); - // Supports two operand einsum notation like "ab,cb->ac". xla::XlaOp Einsum( xla::XlaOp x, xla::XlaOp y, absl::string_view einsum_config, @@ -128,9 +120,6 @@ xla::XlaOp Einsum( xla::XlaOp x, absl::string_view einsum_config, xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT); -// Handles repeated indices within an operand by taking the tensor diagonal of -// the input. -xla::XlaOp EinsumDiagonal(XlaOp x, absl::Span config); // Same as above but supporting numeric labels on dimensions. 
So "ab,cb->ac" // becomes: diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc index ebbf39ec096..628447c289e 100644 --- a/tensorflow/compiler/xla/client/lib/matrix_test.cc +++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc @@ -233,12 +233,23 @@ XLA_TEST_F(MatrixTest, ParseEinsumString) { }; std::vector> good_test_cases = { - {"ab", "bc", "ac"}, {"Bab", "Bbc", "Bac"}, - {"ab", "cd", "dcba"}, {"abc", "abd", "cbd"}, - {"...ab", "...bc", "...ac"}, {"a...bc", "...abd", "cbd..."}, - {"...ab", "...bc", "ac"}, {"...b", "...bc", "...c"}, - {"...abz", "...bc", "...ac"}, {"...ab", "...bcz", "...ac"}, - {"abz", "bc", "ac"}, {"ab", "bcz", "ac"}, + {"ab", "bc", "ac"}, + {"Bab", "Bbc", "Bac"}, + {"ab", "cd", "dcba"}, + {"abc", "abd", "cbd"}, + {"...ab", "...bc", "...ac"}, + {"a...bc", "...abd", "cbd..."}, + {"...ab", "...bc", "ac"}, + {"...b", "...bc", "...c"}, + {"...abz", "...bc", "...ac"}, + {"...ab", "...bcz", "...ac"}, + {"abz", "bc", "ac"}, + {"ab", "bcz", "ac"}, + + {"a", "b", "c"}, + {"...a", "...b", "...c"}, + {"abb", "bcc", "ac"}, + {"ab", "bc", "ad"}, }; for (auto test_case : good_test_cases) { auto parse_result_or_status = @@ -249,9 +260,6 @@ XLA_TEST_F(MatrixTest, ParseEinsumString) { for (int i = 0; i < 3; ++i) { EXPECT_EQ(parse_result[i], to_vec(test_case[i])); } - EXPECT_TRUE(ValidateEinsumNumericDimensions( - parse_result[0], parse_result[1], parse_result[2]) - .ok()); } std::vector einsum_strings_that_fail_parsing = { @@ -261,24 +269,6 @@ XLA_TEST_F(MatrixTest, ParseEinsumString) { auto parse_result_or_status = ParseEinsumString(test_case, 3, 3); EXPECT_FALSE(parse_result_or_status.status().ok()); } - std::vector> einsum_strings_that_fail_numeric_validation = - { - {"a", "b", "c"}, - {"...a", "...b", "...c"}, - {"abb", "bcc", "ac"}, - {"ab", "bc", "ad"}, - }; - - for (auto test_case : einsum_strings_that_fail_numeric_validation) { - auto parse_result_or_status = - ParseEinsumString(to_string(test_case[0], test_case[1], test_case[2]), - test_case[0].size(), test_case[1].size()); - EXPECT_TRUE(parse_result_or_status.status().ok()); - auto parse_result = parse_result_or_status.ValueOrDie(); - EXPECT_FALSE(ValidateEinsumNumericDimensions( - parse_result[0], parse_result[1], parse_result[2]) - .ok()); - } } XLA_TEST_F(MatrixTest, NormalizeEinsumString) { diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index 044a742eddd..60086773d18 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -426,32 +426,36 @@ RngOutput PhiloxRngBit64(XlaOp op_key, XlaOp initial_state, XlaOp ConvertRandomBitsToUniformFloatingPoint(XlaOp bits, XlaOp minval, XlaOp maxval) { XlaBuilder* builder = bits.builder(); - PrimitiveType value_type = - builder->GetShape(minval).ConsumeValueOrDie().element_type(); - PrimitiveType bit_type = - builder->GetShape(bits).ConsumeValueOrDie().element_type(); - CHECK((value_type == F32 && bit_type == U32) || - (value_type == F64 && bit_type == U64)); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape* minval_shape, + builder->GetShapePtr(minval)); + TF_ASSIGN_OR_RETURN(const Shape* bits_shape, builder->GetShapePtr(bits)); + PrimitiveType value_type = minval_shape->element_type(); + PrimitiveType bit_type = bits_shape->element_type(); + CHECK((value_type == F32 && bit_type == U32) || + (value_type == F64 && bit_type == U64)); - // Form random mantissa bits for float/double, 
with a leading 1 bit. - int num_float_bits = primitive_util::BitWidth(value_type); - // Subtract one as SignificandWidth includes the leading 1 bit. - int num_mantissa_bits = primitive_util::SignificandWidth(value_type) - 1; + // Form random mantissa bits for float/double, with a leading 1 bit. + int num_float_bits = primitive_util::BitWidth(value_type); + // Subtract one as SignificandWidth includes the leading 1 bit. + int num_mantissa_bits = primitive_util::SignificandWidth(value_type) - 1; - // Ignore the exponent bits and convert the mantissa bits to the floating - // point type. - bits = ShiftRightLogical( - bits, ScalarLike(bits, num_float_bits - num_mantissa_bits)); + // Ignore the exponent bits and convert the mantissa bits to the floating + // point type. + bits = ShiftRightLogical( + bits, ScalarLike(bits, num_float_bits - num_mantissa_bits)); - // We have an integer-valued floating point number in the range - // [0, 2**{num_mantissa_bits}). - XlaOp values = ConvertElementType(bits, value_type); + // We have an integer-valued floating point number in the range + // [0, 2**{num_mantissa_bits}). + XlaOp values = ConvertElementType(bits, value_type); - // Divide by 2**{-num_mantissa_bits} to get a number in the range [0.0, 1.0). - values = values * ScalarLike(values, std::ldexp(1., -num_mantissa_bits)); + // Divide by 2**{-num_mantissa_bits} to get a number in the range + // [0.0, 1.0). + values = values * ScalarLike(values, std::ldexp(1., -num_mantissa_bits)); - // Multiply and add to shift to the range [minval, maxval). - return values * (maxval - minval) + minval; + // Multiply and add to shift to the range [minval, maxval). + return values * (maxval - minval) + minval; + }); } XlaOp ConvertRandomBitsToUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval, @@ -483,6 +487,10 @@ std::pair BoxMullerTransform(XlaOp x0, XlaOp x1) { } // namespace +XlaOp PhiloxIncreaseCounter(XlaOp counter, XlaOp delta) { + return Uint128ToOp(Uint128AddUint64(Uint128FromOp(counter), delta)); +} + RngOutput ThreeFryBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h index 107fd884de3..20ad223403d 100644 --- a/tensorflow/compiler/xla/client/lib/prng.h +++ b/tensorflow/compiler/xla/client/lib/prng.h @@ -89,6 +89,9 @@ RngOutput NormalFloatingPointDistribution(XlaOp key, XlaOp initial_state, xla::XlaOp ConcatScalars(xla::XlaBuilder* builder, absl::Span scalars); +// Increases Philox counter (an uint128) by a delta (an uint64). +xla::XlaOp PhiloxIncreaseCounter(xla::XlaOp counter, xla::XlaOp delta); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc index b2eecbac309..09fa465a865 100644 --- a/tensorflow/compiler/xla/client/lib/qr.cc +++ b/tensorflow/compiler/xla/client/lib/qr.cc @@ -127,29 +127,24 @@ Status House(XlaOp x, XlaOp k, absl::Span batch_dims, // def qr(a): // m = a.shape[0] // n = a.shape[1] -// vs = np.zeros([m, n]) // taus = np.zeros([n]) // for j in xrange(min(m, n)): // v, tau, beta = house(a[:, j], j) -// # Unusually, we apply the Householder transformation to the entirety of -// # a, wasting FLOPs to maintain the static shape invariant that XLA -// # requires. For columns that precede j this has no effect. 
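The refactored ConvertRandomBitsToUniformFloatingPoint above keeps the same construction: shift the random bits down to the mantissa width, convert to the float type (giving an integer in [0, 2^mantissa_bits)), scale by 2^-mantissa_bits into [0, 1), then map affinely onto [minval, maxval). A NumPy sketch for the U32 -> F32 case (illustration only, not the XLA code):

```python
import numpy as np

def bits_to_uniform_f32(bits, minval, maxval):
    num_float_bits = 32
    num_mantissa_bits = 24 - 1                    # SignificandWidth - 1 for F32
    bits = bits >> np.uint32(num_float_bits - num_mantissa_bits)
    values = bits.astype(np.float32)              # integer-valued, in [0, 2**23)
    values = values * np.float32(np.ldexp(1.0, -num_mantissa_bits))  # -> [0, 1)
    return values * (maxval - minval) + minval

random_bits = np.frombuffer(np.random.bytes(20), dtype=np.uint32)  # 5 x u32
print(bits_to_uniform_f32(random_bits, np.float32(2.0), np.float32(3.0)))
```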
-// a[:, :] -= tau * np.dot(v[:, np.newaxis], -// np.dot(v[np.newaxis, :], a[:, :])) +// a[:, j+1:] -= tau * np.dot(v[:, np.newaxis], +// np.dot(v[np.newaxis, :], a[:, j+1:])) // # Form column j explicitly rather than relying on the precision of the // # Householder update. // a[j, j] = beta -// a[j+1:, j] = np.zeros([m - j - 1], dtype=a.dtype) -// vs[:, j] = v +// a[j+1:, j] = v[j+1:] // taus[j] = tau -// return (q, vs, taus) +// return (a, taus) struct QRBlockResult { - // The factored R value - XlaOp r; + // The upper-triangular matrix R, packed together with the lower-triangular + // elementary Householder reflectors `vs` below the diagonal. + XlaOp a; // Representation of the Householder matrices I - beta v v.T XlaOp taus; // Shape: [..., n] - XlaOp vs; // Shape: [..., m, n] }; StatusOr QRBlock(XlaOp a, PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); @@ -176,57 +171,52 @@ StatusOr QRBlock(XlaOp a, PrecisionConfig::Precision precision) { auto qr_body_fn = [&](XlaOp j, absl::Span values, XlaBuilder* builder) -> StatusOr> { auto a = values[0]; - auto vs = values[1]; - auto taus = values[2]; + auto taus = values[1]; - // v, beta = house(a[:, j], j) + // v, tau, beta = house(a[:, j], j) auto x = DynamicSliceInMinorDims(a, {j}, {1}); XlaOp v, tau, beta; TF_RETURN_IF_ERROR(House(Collapse(x, {num_dims - 2, num_dims - 1}), j, batch_dims, m, &v, &tau, &beta)); + const int64 minor_dim = batch_dims.size(); + auto iota_mn = Iota( + builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), + minor_dim + 1); + std::vector shape = batch_dims; shape.push_back(1); shape.push_back(m); auto v_broadcast = Reshape(v, shape); - // a[:, :] -= tau * np.dot(v[:, np.newaxis], - // np.dot(v[np.newaxis, :], a[:, :])) - auto vva = BatchDot(v_broadcast, a, precision); + // a[:, j+1:] -= tau * (v[:, np.newaxis] @ (v[np.newaxis, :] @ a[:, j+1:])) + // We use masking rather than a loop-variant shape to handle the j+1: + // indexing. + auto vva = BatchDot(v_broadcast, Select(Lt(j, iota_mn), a, ZerosLike(a)), + precision); vva = BatchDot(v_broadcast, true, vva, false, precision); a = a - Mul(tau, vva, /*broadcast_dimensions=*/batch_dim_indices); - // It is more precise to populate column 'k' explicitly, rather than - // computing it implicitly by applying the Householder transformation. 
- // a[k,k] = beta - // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype) + // a[j, j] = beta + // a[j+1:,j] = v[j+1:] auto iota = Reshape(Iota(a.builder(), S32, m), {m, 1}); auto predecessor_mask = ConvertElementType(Lt(iota, j), type); auto mask = Broadcast(ConvertElementType(Eq(iota, j), type), std::vector(batch_dims.size(), 1)); + auto successor_mask = Gt(Iota(a.builder(), S32, m), j); auto new_x = Mul(x, predecessor_mask, /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) + Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices); + new_x = Add( + new_x, Select(Broadcast(successor_mask, batch_dims), v, ZerosLike(v)), + /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {minor_dim})); // Update a[:,j] std::vector dim_ids(num_dims); std::iota(dim_ids.begin(), dim_ids.end(), 0); new_x = BroadcastInDim(new_x, ConcatVectors(batch_dims, {m, n}), /*broadcast_dimensions=*/dim_ids); - const int64 minor_dim = batch_dims.size(); - auto iota_mn = Iota( - builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), - minor_dim + 1); a = Select(Eq(iota_mn, j), new_x, a); - // vs[:, j] = v - std::vector vs_broadcast_dims(batch_dims.size() + 1); - std::iota(vs_broadcast_dims.begin(), vs_broadcast_dims.end(), 0); - auto vs_zeros = ZerosLike(vs); - auto vs_update = Select( - Eq(iota_mn, j), - Add(vs_zeros, v, /*broadcast_dimensions=*/vs_broadcast_dims), vs_zeros); - vs = vs + vs_update; - // taus[j] = tau std::vector tau_broadcast_dims(batch_dims.size()); std::iota(tau_broadcast_dims.begin(), tau_broadcast_dims.end(), 0); @@ -240,40 +230,38 @@ StatusOr QRBlock(XlaOp a, PrecisionConfig::Precision precision) { Add(taus_zeros, tau, /*broadcast_dimensions=*/tau_broadcast_dims), taus_zeros); taus = taus + taus_update; - return std::vector{a, vs, taus}; + return std::vector{a, taus}; }; - auto vs = Zeros( - builder, ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n}))); auto taus = Zeros(builder, ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n}))); TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(std::min(m, n), S32, qr_body_fn, - {a, vs, taus}, "qr", builder)); + {a, taus}, "qr", builder)); QRBlockResult result; - result.r = values[0]; - result.vs = values[1]; - result.taus = values[2]; + result.a = values[0]; + result.taus = values[1]; return result; } -// Computes W and Y such that I-WY is equivalent to the sequence of Householder -// transformations given by vs and taus. -// Golub and van Loan, "Matrix Computations", algorithm 5.1.2. -// Y = np.zeros([m, n]) -// W = np.zeros([m, n]) -// Y[:, 0] = vs[:, 0] -// W[:, 0] = -taus[0] * vs[:, 0] -// for j in xrange(1, n): -// v = vs[:, j] -// z = -taus[j] * v - taus[j] * np.dot(W, np.dot(Y.T, v)) -// W[:, j] = z -// Y[:, j] = v -// return W -// There is no need to return Y since at termination of the loop it is equal to -// vs. -StatusOr ComputeWYRepresentation(PrimitiveType type, +// Computes T such that (I - Y @ T @ Y^t) is a product of the elementary +// Householder reflectors given by `vs` and `taus`. +// +// Schreiber, Robert, and Charles Van Loan. "A storage-efficient WY +// representation for products of Householder transformations." SIAM Journal on +// Scientific and Statistical Computing 10.1 (1989): 53-57. +// +// def compact_wy(vs, taus): +// m, n = vs.shape[-2:] +// t = np.eye(n) * -taus +// # We premultiply Y.T @ vs, since we would prefer to compute a single matrix +// # multiplication to many matrix-vector products. 
+// vtv = -taus[None, :] * np.triu(vs.T @ vs, 1) + np.eye(n) +// for i in range(1, n): +// t[:, i] = np.dot(t, vtv[:, i]) +// return t +StatusOr CompactWYRepresentation(PrimitiveType type, absl::Span batch_dims, XlaOp vs, XlaOp taus, int64 m, int64 n, PrecisionConfig::Precision precision) { @@ -284,50 +272,38 @@ StatusOr ComputeWYRepresentation(PrimitiveType type, auto body_fn = [&](XlaOp j, absl::Span values, XlaBuilder* builder) -> StatusOr> { // w has shape [..., m, n] - auto w = values[0]; - const auto vs = values[1]; - const auto taus = values[2]; + auto t = values[0]; + const auto vtv = values[1]; // Want j values in range [1, ... n). j = j + ConstantR0(builder, 1); - // vs has shape [..., m, 1] - auto v = DynamicSliceInMinorDims(vs, {j}, {1}); - // beta has shape [..., 1] - auto beta = DynamicSliceInMinorDims(taus, {j}, {1}); - - auto iota_mn = Iota( - builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), - n_index); - - // y has shape [..., m, n] - auto y = Select(Ge(iota_mn, j), ZerosLike(vs), vs); // yv has shape [..., n, 1] - auto yv = BatchDot(y, true, v, false, precision); - // wyv has shape [..., m, 1] - auto wyv = BatchDot(w, yv, precision); + auto yv = DynamicSliceInMinorDims(vtv, {j}, {1}); - auto z = Mul( - -beta, v + wyv, - /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); + // wyv has shape [..., n, 1] + auto z = BatchDot(t, yv, precision); - w = DynamicUpdateSliceInMinorDims(w, z, {j}); + t = DynamicUpdateSliceInMinorDims(t, z, {j}); - return std::vector{w, vs, taus}; + return std::vector{t, vtv}; }; XlaBuilder* builder = vs.builder(); - auto w = Zeros(builder, - ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n}))); - auto v = SliceInMinorDims(vs, {0}, {1}); - auto beta = SliceInMinorDims(taus, {0}, {1}); - auto bv = - Mul(-beta, v, - /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); - w = UpdateSliceInMinorDims(w, bv, {0}); - TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(n - 1, S32, body_fn, - {w, vs, taus}, "wy", builder)); + auto tau_scale = BroadcastInDim(-taus, ConcatVectors(batch_dims, {1, n}), + ConcatVectors(batch_dim_indices, {n_index})); + + auto eye = Broadcast(IdentityMatrix(builder, type, n, n), batch_dims); + auto t = eye * tau_scale; + + auto vtv = + BatchDot(vs, /*transpose_x=*/true, vs, /*transpose_y=*/false, precision); + vtv = Select(TriangleMask(vtv, 0), ZerosLike(vtv), vtv) * tau_scale; + vtv = vtv + eye; + + TF_ASSIGN_OR_RETURN( + auto values, ForEachIndex(n - 1, S32, body_fn, {t, vtv}, "wy", builder)); return values[0]; } @@ -340,14 +316,12 @@ StatusOr ComputeWYRepresentation(PrimitiveType type, // q = np.eye(m) // for i in xrange(0, min(m, n), block_size): // k = min(block_size, min(m, n) - s) -// (a, vs, taus) = qr(a[i:, i:i+k]) -// y = vs -// w = ComputeWYRepresentation(vs, taus, m-i, k) -// a[i:, i+r:] += np.dot(y, np.dot(w.T, a[i:, i+k:])) -// q[:, i:] += np.dot(q[:, i:], np.dot(w, y.T)) +// (a, taus) = qr(a[i:, i:i+k]) +// y = np.eye(m, n) + np.tril(a, -1) +// t = CompactWYRepresentation(vs, taus, m-i, k) +// a[i:, i+k:] += (y @ t.T) @ (y.T @ a[i:, i+k:]) +// q[:, i:] += (q[:, i:] @ y) @ (y @ t.T).T // return (q, a) -// TODO(phawkins): consider using UT transformations (in the form I - V U V') -// rather than WY transformations. 
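The updated qr pseudocode above packs each Householder vector below the diagonal of `a` (with an implicit unit diagonal) and keeps only `taus`, instead of carrying a separate `vs` buffer. The sketch below spells out that storage scheme for the unblocked case in NumPy, assuming a LAPACK-style house() with v[j] = 1 and H = I - tau * v * v^T (an illustration, not the XLA kernel or its compact-WY blocking):

```python
import numpy as np

def house(x, j):
    # Householder reflector for x[j:]: returns full-length v (zeros above j,
    # v[j] = 1), tau and beta such that (I - tau*v*v^T) @ x = [..., beta, 0, ...].
    v = np.zeros_like(x)
    alpha, sigma = x[j], np.linalg.norm(x[j + 1:])
    if sigma == 0.0:
        return v, 0.0, alpha                      # column is already triangular
    beta = -np.copysign(np.hypot(alpha, sigma), alpha)
    v[j] = 1.0
    v[j + 1:] = x[j + 1:] / (alpha - beta)
    return v, (beta - alpha) / beta, beta

def qr_packed(a):
    a = np.array(a, dtype=float)
    m, n = a.shape
    taus = np.zeros(min(m, n))
    for j in range(min(m, n)):
        v, tau, beta = house(a[:, j], j)
        a[:, j + 1:] -= tau * np.outer(v, v @ a[:, j + 1:])  # masked trailing update
        a[j, j] = beta                                       # form column j explicitly
        a[j + 1:, j] = v[j + 1:]                             # pack reflector below diag
        taus[j] = tau
    return a, taus

# Rebuild Q from the packed reflectors and check Q @ R == A, Q orthonormal.
a0 = np.random.default_rng(0).standard_normal((5, 4))
packed, taus = qr_packed(a0)
r = np.triu(packed)
q = np.eye(5)
for j in reversed(range(4)):                                 # Q = H_0 H_1 ... H_3
    v = np.zeros(5)
    v[j], v[j + 1:] = 1.0, packed[j + 1:, j]
    q -= taus[j] * np.outer(v, v @ q)
print(np.allclose(q @ r, a0), np.allclose(q.T @ q, np.eye(5)))  # True True
```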
StatusOr QRDecomposition( XlaOp a, bool full_matrices, int64 block_size, PrecisionConfig::Precision precision) { @@ -381,27 +355,34 @@ StatusOr QRDecomposition( auto a_block = SliceInMinorDims(a, {i, i}, {m, i + k}); TF_ASSIGN_OR_RETURN(auto qr_block, QRBlock(a_block, precision)); + auto y = Add( + IdentityMatrix(builder, type, m - i, k), + Select(TriangleMask(qr_block.a, -1), qr_block.a, ZerosLike(qr_block.a)), + /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}); - a = UpdateSliceInMinorDims(a, qr_block.r, {i, i}); + a = UpdateSliceInMinorDims(a, qr_block.a, {i, i}); - // Compute the I-WY block representation of a product of Householder - // matrices. + // Compute the I + Y @ T @ Y^t block representation of a product of + // Householder matrices. TF_ASSIGN_OR_RETURN( - auto w, ComputeWYRepresentation(type, batch_dims, qr_block.vs, - qr_block.taus, m - i, k, precision)); - auto y = qr_block.vs; + auto t, CompactWYRepresentation(type, batch_dims, y, qr_block.taus, + m - i, k, precision)); - // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) + // a[i:, i+k:] += (y @ t.T) @ (y.T @ a[i:, i+k:]) + auto yt = + BatchDot(y, /*transpose_x=*/false, t, /*transpose_y=*/true, precision); auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); - auto a_update = BatchDot(w, true, a_panel, false, precision); - a_update = BatchDot(y, a_update, precision); + auto a_update = BatchDot(y, /*transpose_x=*/true, a_panel, + /*transpose_y=*/false, precision); + a_update = BatchDot(yt, a_update, precision); a_panel = a_panel + a_update; a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); - // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T)) + // q[:, i:] += (q[:, i:] @ y) @ (y @ t.T).T auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); - auto q_update = BatchDot(q_panel, w, precision); - q_update = BatchDot(q_update, false, y, true, precision); + auto q_update = BatchDot(q_panel, y, precision); + q_update = BatchDot(q_update, /*transpose_x=*/false, yt, + /*transpose_y=*/true, precision); q_panel = q_panel + q_update; q = UpdateSliceInMinorDims(q, q_panel, {0, i}); } @@ -414,7 +395,7 @@ StatusOr QRDecomposition( a = SliceInMinorDims(a, {0, 0}, {p, n}); } result.q = q; - result.r = a; + result.r = UpperTriangle(a); return result; } diff --git a/tensorflow/compiler/xla/client/lib/qr_test.cc b/tensorflow/compiler/xla/client/lib/qr_test.cc index a61f243e126..f1d2e4ddb1c 100644 --- a/tensorflow/compiler/xla/client/lib/qr_test.cc +++ b/tensorflow/compiler/xla/client/lib/qr_test.cc @@ -27,12 +27,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" namespace { using QrTest = xla::ClientLibraryTestBase; XLA_TEST_F(QrTest, Simple) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array2D a_vals({ @@ -61,6 +64,8 @@ XLA_TEST_F(QrTest, Simple) { } XLA_TEST_F(QrTest, ZeroDiagonal) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array2D a_vals({ @@ -88,6 +93,8 @@ XLA_TEST_F(QrTest, ZeroDiagonal) { } XLA_TEST_F(QrTest, SimpleBatched) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array3D a_vals({ diff --git a/tensorflow/compiler/xla/client/lib/quantize.h b/tensorflow/compiler/xla/client/lib/quantize.h index 26dbbd5b00b..320dfcbf062 100644 --- a/tensorflow/compiler/xla/client/lib/quantize.h +++ b/tensorflow/compiler/xla/client/lib/quantize.h @@ -25,7 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc index 1c0680b883a..58905e4ca6f 100644 --- a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc +++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc @@ -228,7 +228,7 @@ StatusOr> WhileLoopFn( auto max_sweeps = ScalarLike(k, max_sweep_updates); auto sweep_update_cond = Gt(max_sweeps, k); - auto norms = ComputeFrobeniusNorms(values[2]).ValueOrDie(); + TF_ASSIGN_OR_RETURN(auto norms, ComputeFrobeniusNorms(values[2])); auto tol = norms.total_norm * values[3]; auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm), xla::ConstantR0(cond_builder, false), @@ -400,7 +400,7 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, return result; }; auto shape_with_status = builder->GetShape(a); - if (!shape_with_status.status().ok()) { + if (!shape_with_status.ok()) { return return_error(shape_with_status.status()); } Shape a_shape = shape_with_status.ValueOrDie(); @@ -450,7 +450,7 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, S32, // "CyclicJacobi", // builder); - if (!output_with_status.status().ok()) { + if (!output_with_status.ok()) { return return_error(output_with_status.status()); } @@ -460,7 +460,11 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, result.v = output[1]; result.w = GetMatrixDiagonal(output[2]); - return SortByEigenvalues(result).ValueOrDie(); + auto result_or = SortByEigenvalues(result); + if (!result_or.ok()) { + return return_error(result_or.status()); + } + return result_or.ValueOrDie(); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc index 8e2e713c45c..10e27285f02 100644 --- a/tensorflow/compiler/xla/client/lib/slicing_test.cc +++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc @@ -206,10 +206,12 @@ XLA_TEST_F(SlicingTest, DoubleEmptyIndexSelect) { xla::XlaOp input, index; Literal 
l(ShapeUtil::MakeShape(F32, {0, 1, 2, 0})); Literal i(ShapeUtil::MakeShape(S32, {0})); - auto input_data = - CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); - auto index_data = - CreateParameterAndTransferLiteral(1, i, "index", &builder, &index); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input)); + TF_ASSERT_OK_AND_ASSIGN( + auto index_data, + CreateParameterAndTransferLiteral(1, i, "index", &builder, &index)); TorchIndexSelect(input, index, 0); ComputeAndCompareLiteral(&builder, l, {input_data.get(), index_data.get()}); } @@ -219,8 +221,9 @@ XLA_TEST_F(SlicingTest, EmptyIndexSelectNonZero) { xla::XlaOp input, index; Literal l(ShapeUtil::MakeShape(F32, {0, 2})); - auto input_data = - CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input)); auto index_data = CreateR1Parameter({0, 0, 0}, 1, "index", &builder, &index); TorchIndexSelect(input, index, 0); diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index 750237c2000..abb0054558f 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -16,6 +16,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/sorting.h" #include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/loops.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" @@ -27,6 +30,20 @@ XlaOp TopK(XlaOp input, int64 k) { return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); int last_dim = input_shape.dimensions_size() - 1; + int64 last_dim_size = input_shape.dimensions(last_dim); + // TODO(b/148796364): tune these constants for better performance. + const int64 kPerPartitionSize = 8192; // 2^13 + const int64 kLastDimSizeThreshold = 524288; // 2^19 + const int64 kMinNumPartitions = 8; + const int64 kMinimalK = 1000; + if ((k >= kMinimalK) && (k < kPerPartitionSize) && + (kPerPartitionSize / k > 2) && last_dim_size >= kLastDimSizeThreshold) { + int64 num_partitions = + CeilOfRatio(last_dim_size - k, kPerPartitionSize - k); + if (num_partitions >= kMinNumPartitions) { + return TopKWithPartitions(input, k, num_partitions); + } + } Shape iota_shape = ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions())); @@ -80,30 +97,35 @@ XlaOp TopKWithPartitions(XlaOp input, int64 k, int64 num_partitions) { } } - XlaOp values, indices; - for (int64 partition = 0; partition < num_partitions; partition++) { - std::vector start_indices(input_shape.dimensions_size(), 0); - std::vector limit_indices(input_dims.begin(), input_dims.end()); - std::vector strides(input_shape.dimensions_size(), 1); - start_indices[last_dim] = partition * per_partition_size; - limit_indices[last_dim] = - std::min((partition + 1) * per_partition_size, last_dim_size); - // Slice value and indices for this partition.. 
- XlaOp sliced_input = Slice(input, start_indices, limit_indices, strides); + auto topk_body_fn = + [&](XlaOp partition, absl::Span values_and_indices, + XlaBuilder* builder) -> StatusOr> { + auto values = values_and_indices[0]; + auto indices = values_and_indices[1]; + auto input = values_and_indices[2]; + auto iota_s32 = values_and_indices[3]; + + // Slice value and indices for this partition. + XlaOp start = Mul(Add(partition, ConstantR0(builder, 1)), + ConstantR0(builder, per_partition_size)); + XlaOp sliced_input = + DynamicSliceInMinorDims(input, {start}, {per_partition_size}); XlaOp sliced_indices = - Slice(iota_s32, start_indices, limit_indices, strides); + DynamicSliceInMinorDims(iota_s32, {start}, {per_partition_size}); // Concat with previous results. - if (partition > 0) { - sliced_input = ConcatInDim(builder, {values, sliced_input}, last_dim); - sliced_indices = - ConcatInDim(builder, {indices, sliced_indices}, last_dim); - } + sliced_input = ConcatInDim(builder, {values, sliced_input}, last_dim); + sliced_indices = + ConcatInDim(builder, {indices, sliced_indices}, last_dim); // Sort this slice XlaOp sort_result = Sort({sliced_input, sliced_indices}, CreateScalarGtComputation({input_shape.element_type(), S32}, sliced_indices.builder()), - last_dim, /*is_stable=*/true); + last_dim, true); + + std::vector start_indices(input_shape.dimensions_size(), 0); + std::vector limit_indices(input_dims.begin(), input_dims.end()); + std::vector strides(input_shape.dimensions_size(), 1); // Slice topk. start_indices[last_dim] = 0; limit_indices[last_dim] = k; @@ -111,8 +133,42 @@ XlaOp TopKWithPartitions(XlaOp input, int64 k, int64 num_partitions) { limit_indices, strides); indices = Slice(GetTupleElement(sort_result, 1), start_indices, limit_indices, strides); - } - return Tuple(builder, {values, indices}); + return std::vector{values, indices, input, iota_s32}; + }; + + // Get the values and indices for the first topk so that they can + // be passed to the while loop. + std::vector start_indices(input_shape.dimensions_size(), 0); + std::vector limit_indices(input_dims.begin(), input_dims.end()); + std::vector strides(input_shape.dimensions_size(), 1); + start_indices[last_dim] = 0; + limit_indices[last_dim] = per_partition_size; + // Slice value and indices for the first partition. + XlaOp sliced_input = Slice(input, start_indices, limit_indices, strides); + XlaOp sliced_indices = + Slice(iota_s32, start_indices, limit_indices, strides); + // Sort this slice + XlaOp sort_result = + Sort({sliced_input, sliced_indices}, + CreateScalarGtComputation({input_shape.element_type(), S32}, + sliced_indices.builder()), + last_dim, /*is_stable=*/true); + + // Slice topk. + start_indices[last_dim] = 0; + limit_indices[last_dim] = k; + XlaOp values = Slice(GetTupleElement(sort_result, 0), start_indices, + limit_indices, strides); + XlaOp indices = Slice(GetTupleElement(sort_result, 1), start_indices, + limit_indices, strides); + + // Pass the result of the first TopK to the while loop and do + // num_partition - 1 iterations. 
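TopKWithPartitions now seeds the loop with the top-k of the first partition and then, for each remaining partition, concatenates the carried top-k with the next slice, sorts, and keeps the first k again; given the constants above, TopK only takes this path for large inputs (roughly k >= 1000 and a last dimension of at least 2^19). A NumPy sketch of the carry-and-merge loop along the last axis (it illustrates the algorithm, not ForEachIndex, and assumes k is no larger than the partition size):

```python
import numpy as np

def topk_with_partitions(x, k, num_partitions):
    n = x.shape[-1]
    per_partition = -(-n // num_partitions)               # ceil(n / num_partitions)
    iota = np.broadcast_to(np.arange(n), x.shape)

    def top_k_of(vals, idxs):                             # sort descending, keep k
        order = np.argsort(-vals, axis=-1, kind="stable")[..., :k]
        return (np.take_along_axis(vals, order, -1),
                np.take_along_axis(idxs, order, -1))

    # Top-k of the first partition seeds the carried state.
    values, indices = top_k_of(x[..., :per_partition], iota[..., :per_partition])
    for p in range(1, num_partitions):                    # merge remaining partitions
        lo, hi = p * per_partition, min((p + 1) * per_partition, n)
        values = np.concatenate([values, x[..., lo:hi]], axis=-1)
        indices = np.concatenate([indices, iota[..., lo:hi]], axis=-1)
        values, indices = top_k_of(values, indices)
    return values, indices

x = np.random.default_rng(0).standard_normal((2, 1000))
v, _ = topk_with_partitions(x, k=5, num_partitions=8)
print(np.allclose(v, -np.sort(-x, axis=-1)[..., :5]))     # True
```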
+ TF_ASSIGN_OR_RETURN(auto values_and_indices, + ForEachIndex(num_partitions - 1, S32, topk_body_fn, + {values, indices, input, iota_s32}, + "topk_with_partition", builder)); + return Tuple(builder, {values_and_indices[0], values_and_indices[1]}); }); } diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index e01f6faf59e..e820d5bfe6f 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -118,6 +118,19 @@ XLA_TEST_F(SortingTest, TopK3From8Values5Partitions) { ComputeAndCompareR1(&builder, {7.0, 6.0, 5.0}, {}); } +XLA_TEST_F(SortingTest, DISABLED_TopKLargeInput) { + XlaBuilder builder(TestName()); + Array input({2, 1000000}); + input.FillRandom(1.0f, 2.0f); + auto x = + CreateConstantFromLiteral(LiteralUtil::CreateFromArray(input), &builder); + Array2D expected_array(2, 1000); + expected_array.Fill(2.0f); + xla::GetTupleElement(xla::TopK(x, 1000), 0); + ErrorSpec error_spec(10.0f, 10.0f); + ComputeAndCompareR2(&builder, expected_array, {}, error_spec); +} + XLA_TEST_F(SortingTest, TopK3From8Indices5Partitions) { XlaBuilder builder(TestName()); auto x_rev = diff --git a/tensorflow/compiler/xla/client/lib/svd.cc b/tensorflow/compiler/xla/client/lib/svd.cc index 646875a20a2..80ea4d644c0 100644 --- a/tensorflow/compiler/xla/client/lib/svd.cc +++ b/tensorflow/compiler/xla/client/lib/svd.cc @@ -837,8 +837,11 @@ SVDResult SVD(XlaOp a, int64 max_iter, float epsilon, auto eps = ScalarLike(a, epsilon); - SVDResult svd_result = - HouseHolderBidiagonalization(a, eps, precision).ValueOrDie(); + auto svd_result_or = HouseHolderBidiagonalization(a, eps, precision); + if (!svd_result_or.ok()) { + return return_error(svd_result_or.status()); + } + SVDResult svd_result = svd_result_or.ValueOrDie(); auto output_with_status = WhileLoopFn( { @@ -861,7 +864,13 @@ SVDResult SVD(XlaOp a, int64 max_iter, float epsilon, svd_result.u = output[1]; svd_result.v = output[2]; svd_result.d = output[3]; - svd_result = SortBySingularValuesAndPostProcessing(svd_result).ValueOrDie(); + + svd_result_or = SortBySingularValuesAndPostProcessing(svd_result); + if (!svd_result_or.ok()) { + return return_error(svd_result_or.status()); + } + svd_result = svd_result_or.ValueOrDie(); + if (maybe_transpose) { std::swap(svd_result.u, svd_result.v); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 52f61408cbb..3e2a4eb53a7 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -26,12 +26,15 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/match.h" +#include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/comparison_util.h" #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -39,6 +42,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" namespace xla { @@ -71,6 +76,58 @@ void SetProtoIdAndName(T* entry, const string& base_name, char separator, entry->set_id(id); entry->set_name(GetFullName(base_name, separator, id)); } + +ShapeProto ConvertShapeProtoToPred(const ShapeProto& shape_proto) { + return ShapeUtil::ChangeElementType(Shape(shape_proto), PRED).ToProto(); +} + +void SetInstructionAsConstant(HloInstructionProto* instr, int64 id, + const Shape& shape, bool pred) { + Literal literal = LiteralUtil::CreateR0(pred); + Literal literal_broadcast = literal.Broadcast(shape, {}).ValueOrDie(); + *instr->mutable_shape() = shape.ToProto(); + *instr->mutable_literal() = literal_broadcast.ToProto(); + *instr->mutable_opcode() = HloOpcodeString(HloOpcode::kConstant); +} + +// Converts a HloComputation into ReducerOr with predicate types. +HloComputationProto CreateReduceOr(int64 reducer_id, + HloComputationProto* original_reducer) { + HloComputationProto reducer; + SetProtoIdAndName(&reducer, StrCat("reduce_or"), kNameSeparator, reducer_id); + std::vector operands_id; + for (auto& inst : original_reducer->instructions()) { + // Copy params. + if (StringToHloOpcode(inst.opcode()).ValueOrDie() == + HloOpcode::kParameter) { + HloInstructionProto* new_param = reducer.add_instructions(); + *new_param = inst; + *new_param->mutable_shape() = ConvertShapeProtoToPred(inst.shape()); + operands_id.push_back(inst.id()); + } + if (inst.id() == original_reducer->root_id()) { + HloInstructionProto* new_root = reducer.add_instructions(); + *new_root = inst; + *new_root->mutable_shape() = ConvertShapeProtoToPred(inst.shape()); + *new_root->mutable_opcode() = HloOpcodeString(HloOpcode::kOr); + new_root->clear_operand_ids(); + for (int64 operand_id : operands_id) { + new_root->add_operand_ids(operand_id); + } + reducer.set_root_id(inst.id()); + } + } + return reducer; +} + +bool InstrIsSetBound(const HloInstructionProto* instr_proto) { + HloOpcode opcode = StringToHloOpcode(instr_proto->opcode()).ValueOrDie(); + if (opcode == HloOpcode::kCustomCall && + instr_proto->custom_call_target() == "SetBound") { + return true; + } + return false; +} } // namespace namespace internal { @@ -247,7 +304,6 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, // GetDimensionSize is always considered constant in XLA -- If a dynamic // dimension is presented, -1 is returned. break; - // Non functional ops. case HloOpcode::kRng: case HloOpcode::kAllReduce: @@ -260,6 +316,11 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, // cannot be constant. We cannot set is_functional=false in other similar // cases since we're already relying on IsConstant to return true. case HloOpcode::kCustomCall: + if (instr.custom_call_target() == "SetBound") { + // Set bound is considered constant -- the bound is used as the value. + break; + } + TF_FALLTHROUGH_INTENDED; case HloOpcode::kWhile: // TODO(b/32495713): We aren't checking the condition and body // computations themselves. 
@@ -446,7 +507,7 @@ StatusOr XlaBuilder::Build(int64 root_id, alias.param_index.ToString().c_str()); } TF_RETURN_IF_ERROR(config.SetUpAlias(alias.output_index, alias.param_number, - alias.param_index)); + alias.param_index, alias.kind)); } *module->mutable_input_output_alias() = config.ToProto(); return Status::OK(); @@ -529,7 +590,8 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, XlaOp operand) { XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, - absl::optional direction) { + absl::optional direction, + absl::optional type) { return ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); @@ -587,7 +649,11 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, return InvalidArgument( "kCompare expects a ComparisonDirection, but none provided."); } - return Compare(shape, updated_lhs, updated_rhs, *direction); + if (type == absl::nullopt) { + return Compare(shape, updated_lhs, updated_rhs, *direction); + } else { + return Compare(shape, updated_lhs, updated_rhs, *direction, *type); + } } if (direction.has_value()) { @@ -610,8 +676,16 @@ XlaOp XlaBuilder::BinaryOpNoBroadcast(HloOpcode binop, const Shape& shape, StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) { + return Compare(shape, lhs, rhs, direction, + Comparison::DefaultComparisonType(shape.element_type())); +} + +StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, + ComparisonDirection direction, + Comparison::Type type) { HloInstructionProto instr; instr.set_comparison_direction(ComparisonDirectionToString(direction)); + instr.set_comparison_type(ComparisonTypeToString(type)); *instr.mutable_shape() = shape.ToProto(); return AddInstruction(std::move(instr), HloOpcode::kCompare, {lhs, rhs}); } @@ -1022,6 +1096,36 @@ XlaOp XlaBuilder::Reshape(const Shape& shape, XlaOp operand, }); } +XlaOp XlaBuilder::DynamicReshape(XlaOp operand, + absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + std::vector dim_size_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& dim_size_shapes, + GetOperandShapes(dim_sizes)); + + absl::c_transform(dim_size_shapes, std::back_inserter(dim_size_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN(const Shape shape, + ShapeInference::InferDynamicReshapeShape( + *operand_shape, dim_size_shape_ptrs, + new_size_bounds, dims_are_dynamic)); + TF_RETURN_IF_ERROR(first_error_); + std::vector operands; + operands.reserve(1 + dim_sizes.size()); + operands.push_back(operand); + for (const XlaOp& dim_size : dim_sizes) { + operands.push_back(dim_size); + } + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kDynamicReshape, + operands); + }); +} + XlaOp XlaBuilder::Collapse(XlaOp operand, absl::Span dimensions) { return ReportErrorOrReturn([&]() -> StatusOr { if (dimensions.size() <= 1) { @@ -1364,6 +1468,25 @@ StatusOr XlaBuilder::FftInternal( return AddInstruction(std::move(instr), HloOpcode::kFft, {operand}); } +StatusOr XlaBuilder::TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options) { + HloInstructionProto instr; + *instr.mutable_triangular_solve_options() = std::move(options); + 
*instr.mutable_shape() = shape.ToProto(); + + return AddInstruction(std::move(instr), HloOpcode::kTriangularSolve, {a, b}); +} + +StatusOr XlaBuilder::CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) { + HloInstructionProto instr; + xla::CholeskyOptions& options = *instr.mutable_cholesky_options(); + options.set_lower(lower); + *instr.mutable_shape() = shape.ToProto(); + + return AddInstruction(std::move(instr), HloOpcode::kCholesky, {a}); +} + XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -1874,7 +1997,6 @@ XlaOp XlaBuilder::RngUniform(XlaOp a, XlaOp b, const Shape& shape) { XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, XlaOp initial_state, const Shape& shape) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); TF_ASSIGN_OR_RETURN(Shape state_shape, GetShape(initial_state)); Shape output_shape = shape; @@ -1893,14 +2015,22 @@ XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, return InvalidArgument("Unsupported shape for RngBitGenerator: %s", PrimitiveType_Name(output_shape.element_type())); } - *instr.mutable_shape() = - ShapeUtil::MakeTupleShape({state_shape, output_shape}).ToProto(); - instr.set_rng_algorithm(algorithm); - return AddInstruction(std::move(instr), HloOpcode::kRngBitGenerator, - {initial_state}); + return RngBitGeneratorInternal( + ShapeUtil::MakeTupleShape({state_shape, output_shape}), algorithm, + initial_state); }); } +StatusOr XlaBuilder::RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state) { + HloInstructionProto instr; + *instr.mutable_shape() = full_result_shape.ToProto(); + instr.set_rng_algorithm(algorithm); + return AddInstruction(std::move(instr), HloOpcode::kRngBitGenerator, + {initial_state}); +} + XlaOp XlaBuilder::While(const XlaComputation& condition, const XlaComputation& body, XlaOp init) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -2466,6 +2596,7 @@ XlaOp XlaBuilder::AllToAll(XlaOp operand, int64 split_dimension, } *(shape.mutable_tuple_shapes(i)->mutable_layout()) = *layout; } + instr.set_constrain_layout(true); } *instr.mutable_shape() = shape.ToProto(); @@ -2842,6 +2973,249 @@ StatusOr XlaBuilder::IsConstant(XlaOp operand) const { return is_constant; } +StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { + TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, + LookUpInstruction(root_op)); + + HloComputationProto entry; + SetProtoIdAndName(&entry, StrCat(name_, "_dynamic_inference"), kNameSeparator, + GetNextId()); + ProgramShapeProto* program_shape = entry.mutable_program_shape(); + *program_shape->mutable_result() = + ShapeUtil::ChangeElementType(Shape(root->shape()), PRED).ToProto(); + + std::vector called_computatons; + // Process instruction and copy it into the new graph. The new node in the new + // graph with have id set to `id`. + auto process_instruction = [&](const HloInstructionProto* instr_proto, + bool need_rewrite, int64 id, + absl::Span operand_ids) { + // Rewrite the instruction with following rules: + // - Unary ops: Convert into bitcast (identity) with type Pred. + // - Binary ops: Convert into binary or. + // - Select: Convert into binary or with its two data operands. + // - Concat / Tuple/ GTE / Bitcast: Copy. + // - Param: Convert to constant True. 
+ // - GetDimensionSize: Convert to constant True if dimension is dynamic, + // contant False if dimension is static. + // - Reduce: Convert to reduce or. + // - Constant: Convert to constant False. + // - Other ops: Not supported. + // Create the instruction for the new handle. + TF_ASSIGN_OR_RETURN(HloOpcode opcode, + StringToHloOpcode(instr_proto->opcode())); + auto* new_instr = entry.add_instructions(); + *new_instr = *instr_proto; + new_instr->set_id(id); + new_instr->mutable_operand_ids()->Clear(); + for (auto operand_id : operand_ids) { + new_instr->mutable_operand_ids()->Add(operand_id); + } + + if (!need_rewrite) { + *new_instr->mutable_name() = + GetFullName(instr_proto->opcode(), kNameSeparator, id); + return Status::OK(); + } + *new_instr->mutable_shape() = ConvertShapeProtoToPred(instr_proto->shape()); + Shape new_shape(new_instr->shape()); + switch (opcode) { + case HloOpcode::kAbs: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kBitcast: + case HloOpcode::kCeil: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCos: + case HloOpcode::kClz: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFloor: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kNot: + case HloOpcode::kNegate: + case HloOpcode::kPopulationCount: + case HloOpcode::kReal: + case HloOpcode::kRsqrt: + case HloOpcode::kLogistic: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kConvert: + case HloOpcode::kSqrt: + case HloOpcode::kCbrt: + case HloOpcode::kTanh: + CHECK_EQ(instr_proto->operand_ids_size(), 1); + *new_instr->mutable_opcode() = HloOpcodeString(HloOpcode::kBitcast); + break; + case HloOpcode::kAdd: + case HloOpcode::kAtan2: + case HloOpcode::kDivide: + case HloOpcode::kComplex: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kSubtract: + case HloOpcode::kCompare: + case HloOpcode::kAnd: + case HloOpcode::kOr: + case HloOpcode::kXor: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + CHECK_EQ(instr_proto->operand_ids_size(), 2); + *new_instr->mutable_opcode() = HloOpcodeString(HloOpcode::kOr); + break; + case HloOpcode::kSelect: + break; + case HloOpcode::kGather: + break; + case HloOpcode::kReduce: { + int64 reducer_id = new_instr->called_computation_ids(0); + called_computatons.push_back( + CreateReduceOr(reducer_id, &embedded_[reducer_id])); + break; + } + case HloOpcode::kTuple: + case HloOpcode::kTranspose: + case HloOpcode::kGetTupleElement: + case HloOpcode::kSlice: + case HloOpcode::kBroadcast: + case HloOpcode::kConcatenate: + case HloOpcode::kReshape: + break; + case HloOpcode::kGetDimensionSize: { + int64 dimension = instr_proto->dimensions(0); + int64 operand_handle = instr_proto->operand_ids(0); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, + LookUpInstructionByHandle(operand_handle)); + + SetInstructionAsConstant( + new_instr, id, new_shape, + operand_proto->shape().is_dynamic_dimension(dimension)); + break; + } + case HloOpcode::kConstant: + SetInstructionAsConstant(new_instr, id, new_shape, false); + break; + case HloOpcode::kCustomCall: + if (instr_proto->custom_call_target() == "SetBound") { + SetInstructionAsConstant(new_instr, id, new_shape, true); + break; + } else { + return InvalidArgument( + "Dynamic inferencing on custom call %s is not supported", + 
instr_proto->DebugString()); + } + case HloOpcode::kParameter: + SetInstructionAsConstant(new_instr, id, new_shape, true); + break; + default: + return InvalidArgument("Dynamic inferencing %s is not supported", + instr_proto->DebugString()); + } + *new_instr->mutable_name() = + GetFullName(instr_proto->opcode(), kNameSeparator, id); + return Status::OK(); + }; + + struct WorkItem { + explicit WorkItem(int64 handle, bool need_rewrite) + : handle(handle), need_rewrite(need_rewrite), visited(false) {} + int64 handle; + // If need_rewrite is true, the instruction will be copied and rewrite into + // a pred instruction indicating if each value is dynamic. If need_rewrite + // is false, simply copy the instruction to the output graph. + // E.g., + // For select(P, A, B), we need to rewrite A and B into predicates, but + // don't need to rewrite P. + bool need_rewrite; + // Used in dfs to remember the ids of processed operands of this item. + std::vector processed_operands; + // Whether this node been visited before or not. + bool visited; + }; + // Only copy each pair of {handle, need_rewrite} once. Value is the id in the + // new graph. + absl::flat_hash_map, int64> seen; + // Monotonically increasing id to assign to new instructions. + int64 global_id = 0; + // The result id of the last rewritten item -- return value of last stack + // item. + int64 stacktop_id = -1; + std::vector worklist; + worklist.push_back(WorkItem(root->id(), true)); + while (!worklist.empty()) { + WorkItem& item = worklist.back(); + auto item_key = std::make_pair(item.handle, item.need_rewrite); + auto iter = seen.find(item_key); + // Already processed this item. Return previous results. + if (iter != seen.end()) { + stacktop_id = iter->second; + worklist.pop_back(); + continue; + } + + int64 next_operand = item.processed_operands.size(); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto, + LookUpInstructionByHandle(item.handle)); + VLOG(3) << "Processing" << instr_proto->name(); + if (!item.visited) { + item.visited = true; + } else { + // Record previous processed operand. + item.processed_operands.push_back(stacktop_id); + next_operand++; + } + TF_ASSIGN_OR_RETURN(HloOpcode opcode, + StringToHloOpcode(instr_proto->opcode())); + if (next_operand >= instr_proto->operand_ids_size() || + opcode == HloOpcode::kGetDimensionSize || + InstrIsSetBound(instr_proto)) { + // No more operands to process, process self. + int64 new_id = ++global_id; + VLOG(3) << "new_id: " << new_id << "instr: " << instr_proto->name(); + TF_RETURN_IF_ERROR(process_instruction(instr_proto, item.need_rewrite, + new_id, item.processed_operands)); + stacktop_id = new_id; + seen[item_key] = stacktop_id; + worklist.pop_back(); + continue; + } + + WorkItem next_item(instr_proto->operand_ids(next_operand), true); + if (opcode == HloOpcode::kSelect && next_operand == 0) { + next_item.need_rewrite = false; + } + if (opcode == HloOpcode::kGather && next_operand == 1) { + next_item.need_rewrite = false; + } + // Push next operand into worklist. 
+ worklist.push_back(next_item); + } + TF_RET_CHECK(stacktop_id != -1); + entry.set_root_id(stacktop_id); + absl::c_sort(*entry.mutable_instructions(), + [](const HloInstructionProto& p1, + const HloInstructionProto& p2) { return p1.id() < p2.id(); }); + XlaComputation computation(entry.id()); + HloModuleProto* module = computation.mutable_proto(); + module->set_name(entry.name()); + module->set_id(entry.id()); + module->set_entry_computation_name(entry.name()); + module->set_entry_computation_id(entry.id()); + *module->mutable_host_program_shape() = *program_shape; + for (auto& called_comp : called_computatons) { + *module->add_computations() = called_comp; + } + *module->add_computations() = std::move(entry); + XLA_VLOG_LINES(3, module->DebugString()); + return std::move(computation); +} + StatusOr XlaBuilder::BuildConstantSubGraph( XlaOp root_op, bool dynamic_dimension_is_minus_one) { TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op)); @@ -2886,26 +3260,33 @@ StatusOr XlaBuilder::BuildConstantSubGraph( LookUpInstructionByHandle(handle)); if (instr_proto->opcode() == - HloOpcodeString(HloOpcode::kGetDimensionSize)) { - // At this point, BuildConstantSubGraph should never encounter a - // GetDimensionSize with a dynamic dimension. IsConstant check would have - // failed at the beginning of this function. - // - // Replace GetDimensionSize with a Constant representing the static bound - // of the shape. - int64 dimension = instr_proto->dimensions(0); - int64 operand_handle = instr_proto->operand_ids(0); - TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, - LookUpInstructionByHandle(operand_handle)); + HloOpcodeString(HloOpcode::kGetDimensionSize) || + InstrIsSetBound(instr_proto)) { + int32 constant_value = -1; + if (instr_proto->opcode() == + HloOpcodeString(HloOpcode::kGetDimensionSize)) { + // At this point, BuildConstantSubGraph should never encounter a + // GetDimensionSize with a dynamic dimension. IsConstant check would + // have failed at the beginning of this function. + // + // Replace GetDimensionSize with a Constant representing the static + // bound of the shape. 
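Reviewer note: `BuildDynamicInferenceGraph` above walks the instruction graph without recursion: a stack of `WorkItem`s, a `seen` map so each instruction is copied once, and a `stacktop_id` that hands the child's result back to its parent. The sketch below reproduces just that traversal shape over a small DAG of integers; it drops the `need_rewrite` half of the key and all HLO details, and every name is illustrative.

```cpp
// Iterative DFS with an explicit worklist, shaped like the traversal in
// BuildDynamicInferenceGraph: every node is processed only after all of its
// operands, results are memoized in `seen`, and `stacktop` carries the most
// recently finished node's id back to its parent.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<int> operands;  // indices of operand nodes
};

// Assigns ids 1..N so that every node gets a larger id than its operands.
std::unordered_map<int, int64_t> AssignPostOrderIds(
    const std::vector<Node>& graph, int root) {
  struct WorkItem {
    int handle;
    bool visited = false;
    std::vector<int64_t> processed_operands;
  };
  std::unordered_map<int, int64_t> seen;  // handle -> assigned id
  std::vector<WorkItem> worklist;
  worklist.push_back({root});
  int64_t next_id = 0;
  int64_t stacktop = -1;
  while (!worklist.empty()) {
    WorkItem& item = worklist.back();
    auto it = seen.find(item.handle);
    if (it != seen.end()) {  // shared operand: reuse the memoized result
      stacktop = it->second;
      worklist.pop_back();
      continue;
    }
    size_t next_operand = item.processed_operands.size();
    if (!item.visited) {
      item.visited = true;
    } else {
      // We just returned from a child; record its id.
      item.processed_operands.push_back(stacktop);
      ++next_operand;
    }
    const std::vector<int>& operands = graph[item.handle].operands;
    if (next_operand >= operands.size()) {
      // All operands done: "process" this node by assigning its id.
      seen[item.handle] = ++next_id;
      stacktop = seen[item.handle];
      worklist.pop_back();
      continue;
    }
    worklist.push_back({operands[next_operand]});
  }
  return seen;
}

int main() {
  // Nodes 0 and 1 feed node 2; nodes 1 and 2 feed node 3 (node 1 is shared).
  std::vector<Node> graph = {{{}}, {{}}, {{0, 1}}, {{1, 2}}};
  for (const auto& [handle, id] : AssignPostOrderIds(graph, /*root=*/3)) {
    std::cout << "node " << handle << " -> id " << id << "\n";
  }
}
```

The memoization is what keeps shared operands from being duplicated, which is also why the real code can afford to sort the emitted instructions by id at the end.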
+ int64 dimension = instr_proto->dimensions(0); + int64 operand_handle = instr_proto->operand_ids(0); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, + LookUpInstructionByHandle(operand_handle)); - int32 constant_dimension_size = -1; - if (!(operand_proto->shape().is_dynamic_dimension(dimension) && - dynamic_dimension_is_minus_one)) { - constant_dimension_size = - static_cast(operand_proto->shape().dimensions(dimension)); + if (!(operand_proto->shape().is_dynamic_dimension(dimension) && + dynamic_dimension_is_minus_one)) { + constant_value = + static_cast(operand_proto->shape().dimensions(dimension)); + } + } else { + TF_RET_CHECK( + absl::SimpleAtoi(instr_proto->backend_config(), &constant_value)); } - Literal literal = LiteralUtil::CreateR0(constant_dimension_size); + Literal literal = LiteralUtil::CreateR0(constant_value); HloInstructionProto const_instr; *const_instr.mutable_shape() = literal.shape().ToProto(); @@ -2937,6 +3318,9 @@ StatusOr XlaBuilder::BuildConstantSubGraph( if (instr_src->opcode() == HloOpcodeString(HloOpcode::kGetDimensionSize)) { continue; } + if (InstrIsSetBound(instr_src)) { + continue; + } auto* instr = entry.add_instructions(); *instr = *instr_src; @@ -3215,6 +3599,13 @@ XlaOp Reshape(const Shape& shape, XlaOp operand) { return operand.builder()->Reshape(shape, operand); } +XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + return operand.builder()->DynamicReshape(operand, dim_sizes, new_size_bounds, + dims_are_dynamic); +} + XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension) { @@ -3274,31 +3665,71 @@ XlaOp Eq(const XlaOp lhs, const XlaOp rhs, return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq); } +XlaOp EqTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq, + compare_type); +} + XlaOp Ne(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe); } +XlaOp NeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe, + compare_type); +} + XlaOp Ge(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe); } +XlaOp GeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe, + compare_type); +} + XlaOp Gt(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt); } +XlaOp GtTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt, + compare_type); +} + XlaOp Le(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe); } +XlaOp LeTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + auto compare_type = 
Comparison::Type::kFloatTotalOrder; + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe, + compare_type); +} XlaOp Lt(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions) { return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt); } +XlaOp LtTotalOrder(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions) { + return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt, + Comparison::Type::kFloatTotalOrder); +} + XlaOp Compare(const XlaOp lhs, const XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction) { @@ -3306,6 +3737,13 @@ XlaOp Compare(const XlaOp lhs, const XlaOp rhs, broadcast_dimensions, direction); } +XlaOp Compare(const XlaOp lhs, const XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, Comparison::Type compare_type) { + return lhs.builder()->BinaryOp(HloOpcode::kCompare, lhs, rhs, + broadcast_dimensions, direction, compare_type); +} + XlaOp Compare(const XlaOp lhs, const XlaOp rhs, ComparisonDirection direction) { return Compare(lhs, rhs, {}, direction); } @@ -3386,36 +3824,26 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, TriangularSolveOptions::Transpose transpose_a) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a)); TF_ASSIGN_OR_RETURN(const Shape* b_shape, builder->GetShapePtr(b)); - xla::TriangularSolveOptions& options = - *instr.mutable_triangular_solve_options(); + xla::TriangularSolveOptions options; options.set_left_side(left_side); options.set_lower(lower); options.set_unit_diagonal(unit_diagonal); options.set_transpose_a(transpose_a); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTriangularSolveShape( *a_shape, *b_shape, options)); - *instr.mutable_shape() = shape.ToProto(); - - return builder->AddInstruction(std::move(instr), - HloOpcode::kTriangularSolve, {a, b}); + return builder->TriangularSolveInternal(shape, a, b, std::move(options)); }); } XlaOp Cholesky(XlaOp a, bool lower) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a)); - xla::CholeskyOptions& options = *instr.mutable_cholesky_options(); - options.set_lower(lower); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCholeskyShape(*a_shape)); - *instr.mutable_shape() = shape.ToProto(); - - return builder->AddInstruction(std::move(instr), HloOpcode::kCholesky, {a}); + return builder->CholeskyInternal(shape, a, lower); }); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 1960d0c4632..cd9809c2a20 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -163,6 +164,15 @@ class XlaBuilder { // OpMetadata attached until a call to ClearOpMetadata. 
void SetOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); } + // Swaps the passed op metadata with the ones currently set. + // + // Returns the old op metadata. + OpMetadata SwapOpMetadata(OpMetadata metadata) { + OpMetadata old_metadata = std::move(metadata_); + metadata_ = std::move(metadata); + return old_metadata; + } + // Similar to SetOpMetadata, but only set the metadata for the next op. void SetOneShotOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); @@ -277,6 +287,31 @@ class XlaBuilder { StatusOr BuildConstantSubGraph( XlaOp root_op, bool dynamic_dimension_is_uint_max = false); + // Similar to BuildConstantSubGraph, but with root element type changed to + // boolean. A true value in the root indicates that the value is dynamic while + // false value indicates that the value is a constant. This will copy the + // needed ops/computations to the subgraph. + // + // E.g., + // Compuptation { + // a = 3 + // b = param(0) + // ROOT Tuple(a + b, a + 1, b + 1) + // } + // Calling BuildDynamicInferenceGraph on root will produce the following + // graph: + // + // Compuptation { + // a = False + // b = True + // ROOT Tuple(a | b, a, b) + // } + // + // The result, which is (True, False, True) after evaluation, can be + // interpreted as "First element is dynamic; Second element is static; Third + // element is dynamic". + StatusOr BuildDynamicInferenceGraph(XlaOp root_op); + // Returns the first error that was encountered while building the // computation. When an error is encountered, by default we return a vacuous // XlaOp and inform the user of the error that occurred while @@ -340,6 +375,7 @@ class XlaBuilder { // // TODO(b/119520625): Remove this API once we have more dynamic shape infra // ready. + ABSL_DEPRECATED("Use SetDimensionSize to set a dynamic dimension.") Status SetDynamicBinding(int64 dynamic_size_param_num, ShapeIndex dynamic_size_param_index, int64 target_param_num, @@ -349,12 +385,16 @@ class XlaBuilder { // not available until the computation is built, and eventual error in the // arguments of this API will be detected only at computation Build() time. // - // Note: Aliasing API is 'may-alias' and only donated buffer at runtime will - // be aliased with output. If a buffer is not donated at runtime, a copy will - // be inserted by XLA to prevent buffer clobbering. + // Note: Except when 'must-alias' is true, alias is assumed to be 'may-alias' + // and only donated buffer at runtime will be aliased with output. If a buffer + // is not donated at runtime, a copy will be inserted by XLA to prevent buffer + // clobbering. void SetUpAlias(const ShapeIndex& output_index, int64 param_number, - const ShapeIndex& param_index) { - input_output_aliases_.push_back({output_index, param_number, param_index}); + const ShapeIndex& param_index, + HloInputOutputAliasConfig::AliasKind kind = + HloInputOutputAliasConfig::AliasKind::kMayAlias) { + input_output_aliases_.push_back( + {output_index, param_number, param_index, kind}); } // Describes an input/output alias as inserted by the SetUpAlias() API. @@ -365,6 +405,8 @@ class XlaBuilder { int64 param_number; // Specifies the index of the aliased buffer in the parameter ShapeIndex param_index; + // Specifies if the alias is a must alias or may alias. 
+ HloInputOutputAliasConfig::AliasKind kind; }; // Looks up the HloInstruction and sets the frontend attribute "attribute" to @@ -422,6 +464,10 @@ class XlaBuilder { XlaOp Reshape(const Shape& shape, XlaOp operand, int64 inferred_dimension = -1); + XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + XlaOp Collapse(XlaOp operand, absl::Span dimensions); XlaOp Slice(XlaOp operand, absl::Span start_indices, @@ -521,6 +567,12 @@ class XlaBuilder { FftType fft_type, absl::Span fft_length); + virtual StatusOr TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options); + + virtual StatusOr CholeskyInternal(const Shape& shape, XlaOp a, + bool lower); + XlaOp Infeed(const Shape& shape, const string& config = ""); XlaOp InfeedWithToken(XlaOp token, const Shape& shape, const string& config); virtual StatusOr InfeedWithTokenInternal( @@ -669,6 +721,11 @@ class XlaBuilder { XlaOp RngBitGenerator(RandomAlgorithm algorithm, XlaOp initial_state, const Shape& shape); + // Internal variant for the op with the full result shape containing both data + // and state shape as a tuple. + virtual StatusOr RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state); XlaOp While(const XlaComputation& condition, const XlaComputation& body, XlaOp init); @@ -741,8 +798,13 @@ class XlaBuilder { XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); - StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, - absl::Span operands = {}); + virtual StatusOr AddInstruction(HloInstructionProto&& instr, + HloOpcode opcode, + absl::Span operands); + StatusOr AddInstruction(HloInstructionProto&& instr, + HloOpcode opcode) { + return AddInstruction(std::move(instr), opcode, /*operands=*/{}); + } void AddCalledComputation(const XlaComputation& computation, HloInstructionProto* instr); @@ -760,14 +822,17 @@ class XlaBuilder { // broadcast_dimensions specifies which dimensions to use for broadcasting // when the operation is between tensors of different ranks. The direction is // only used if opcode is kCompare. - XlaOp BinaryOp( - HloOpcode binop, XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions, - absl::optional direction = absl::nullopt); + XlaOp BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + absl::optional direction = absl::nullopt, + absl::optional type = absl::nullopt); // Internal helper method for binary op compare without broadcast dimensions. virtual StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, - Comparison::Direction direction); + ComparisonDirection direction); + virtual StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, + ComparisonDirection direction, + Comparison::Type type); // Internal helper method that does the building for an arbitrary binary op // with same ranked operands that doesn't broadcast. 
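Reviewer note on the header changes above: several ops are now split into a public entry point that does validation and shape inference plus a virtual `*Internal` method (`TriangularSolveInternal`, `CholeskyInternal`, `RngBitGeneratorInternal`, and the now-virtual `AddInstruction`) that only emits the instruction, so alternative builders can override emission without re-implementing the checks. A generic sketch of that split follows; the class and method names are made up and this is not the `XlaBuilder` API.

```cpp
// Sketch of the "public wrapper + overridable Internal hook" split used for
// TriangularSolveInternal, CholeskyInternal, RngBitGeneratorInternal, etc.
#include <iostream>
#include <stdexcept>
#include <string>

class Builder {
 public:
  virtual ~Builder() = default;

  // Public entry point: validates and infers, then delegates emission.
  std::string Cholesky(int rank, bool lower) {
    if (rank <= 0) throw std::invalid_argument("rank must be positive");
    const std::string shape = "f32[" + std::to_string(rank) + "," +
                              std::to_string(rank) + "]";
    return CholeskyInternal(shape, lower);
  }

 protected:
  // Overridable hook: only responsible for "emitting" the operation.
  virtual std::string CholeskyInternal(const std::string& shape, bool lower) {
    return "cholesky(" + shape + ", lower=" + (lower ? "true" : "false") + ")";
  }
};

// A derived builder changes how instructions are recorded, not the checks.
class LoggingBuilder : public Builder {
 protected:
  std::string CholeskyInternal(const std::string& shape, bool lower) override {
    std::cout << "emitting cholesky on " << shape << "\n";
    return Builder::CholeskyInternal(shape, lower);
  }
};

int main() {
  LoggingBuilder builder;
  std::cout << builder.Cholesky(/*rank=*/4, /*lower=*/true) << "\n";
}
```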
@@ -905,6 +970,10 @@ class XlaBuilder { friend XlaOp Reshape(const Shape& shape, XlaOp operand); + friend XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + friend XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension); @@ -933,22 +1002,13 @@ class XlaBuilder { friend XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false); friend XlaOp Tuple(XlaBuilder* builder, absl::Span elements); friend XlaOp GetTupleElement(XlaOp tuple_data, int64 index); - friend XlaOp Eq(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Ne(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Ge(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Gt(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Lt(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); - friend XlaOp Le(XlaOp lhs, XlaOp rhs, - absl::Span broadcast_dimensions); friend XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); - friend XlaOp Compare(XlaOp lhs, XlaOp rhs, ComparisonDirection direction); + friend XlaOp Compare(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, + Comparison::Type compare_type); friend XlaOp Dot(XlaOp lhs, XlaOp rhs, const PrecisionConfig* precision_config); friend XlaOp DotGeneral(XlaOp lhs, XlaOp rhs, @@ -1288,6 +1348,25 @@ class XlaScopedFrontendAttributesAssignment { TF_DISALLOW_COPY_AND_ASSIGN(XlaScopedFrontendAttributesAssignment); }; + +// RAII-style object: sets the current op metadata in builder on construction, +// and sets back to the previous assignment on destruction. +class XlaScopedOpMetadataAssignment { + public: + XlaScopedOpMetadataAssignment(xla::XlaBuilder* builder, OpMetadata metadata) + : builder_(builder) { + saved_ = builder_->SwapOpMetadata(metadata); + } + + ~XlaScopedOpMetadataAssignment() { builder_->SwapOpMetadata(saved_); } + + private: + xla::XlaBuilder* const builder_; + OpMetadata saved_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaScopedOpMetadataAssignment); +}; + // Free functions for building XlaOps. The intention is that these will // become the public API for building XlaOps rather than calling methods on // XlaBuilder directly. @@ -1427,9 +1506,16 @@ XlaOp Pad(XlaOp operand, XlaOp padding_value, XlaOp Reshape(XlaOp operand, absl::Span dimensions, absl::Span new_sizes); -// Enqueues an operation onto the computation that collapses the operand, from -// first to last dimension (C order), then reshapes it to the given dimension -// sizes. Conceptually, this is a limited form of "shape casting". +// Enqueues a dynamic reshape operation. The dynamic reshape takes additional +// XlaOps as sizes for the result dimension. The result dim i is a dynamic +// dimension dimension if dims_are_dynamic[i] is true. +XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + +// Enqueues an operation onto the computation that collapses the operand, +// from first to last dimension (C order), then reshapes it to the given +// dimension sizes. Conceptually, this is a limited form of "shape casting". XlaOp Reshape(XlaOp operand, absl::Span new_sizes); // Enqueues a Reshape op that uses an explicit target shape. 
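Reviewer note: `XlaScopedOpMetadataAssignment` above follows the usual RAII save-and-restore shape built on the new `SwapOpMetadata`: swap the new value in on construction, swap the old one back on destruction. A generic standalone version of the same idea, where `ScopedSwap` is an illustrative name rather than anything in the codebase:

```cpp
// Generic save-and-restore guard in the spirit of
// XlaScopedOpMetadataAssignment: the previous value is reinstated when the
// guard goes out of scope, even on early return or exception.
#include <iostream>
#include <string>
#include <utility>

template <typename T>
class ScopedSwap {
 public:
  ScopedSwap(T* slot, T new_value)
      : slot_(slot), saved_(std::exchange(*slot, std::move(new_value))) {}
  ~ScopedSwap() { *slot_ = std::move(saved_); }

  ScopedSwap(const ScopedSwap&) = delete;
  ScopedSwap& operator=(const ScopedSwap&) = delete;

 private:
  T* slot_;
  T saved_;
};

int main() {
  std::string op_metadata = "outer";
  {
    ScopedSwap<std::string> guard(&op_metadata, "inner");
    std::cout << op_metadata << "\n";  // inner
  }
  std::cout << op_metadata << "\n";  // outer (restored by the destructor)
}
```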
@@ -1542,29 +1628,44 @@ XlaOp GetTupleElement(XlaOp tuple_data, int64 index); // Enqueues an equal-to comparison instruction onto the computation. XlaOp Eq(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp EqTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a not-equal comparison instruction onto the computation. XlaOp Ne(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp NeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a greater-or-equal comparison instruction onto the computation. XlaOp Ge(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp GeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a greater-than comparison instruction onto the computation. XlaOp Gt(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp GtTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a less-than comparison instruction onto the computation. XlaOp Lt(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp LtTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a less-or-equal comparison instruction onto the computation. XlaOp Le(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions = {}); +XlaOp LeTotalOrder(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions = {}); // Enqueues a comparison instruction onto the computation (optionally without // broadcast_dimensions for consistency with others). +XlaOp Compare(XlaOp lhs, XlaOp rhs, + absl::Span broadcast_dimensions, + ComparisonDirection direction, Comparison::Type compare_type); XlaOp Compare(XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, ComparisonDirection direction); diff --git a/tensorflow/compiler/xla/comparison_util.cc b/tensorflow/compiler/xla/comparison_util.cc index 47fb69e3bce..06dd9642cac 100644 --- a/tensorflow/compiler/xla/comparison_util.cc +++ b/tensorflow/compiler/xla/comparison_util.cc @@ -54,32 +54,59 @@ StatusOr StringToComparisonDirection( return it->second; } -Comparison::Comparison(Direction dir, PrimitiveType type) : dir_(dir) { +StatusOr StringToComparisonType( + absl::string_view compare_type_name) { + static auto* type_map = new absl::flat_hash_map({ + {"FLOAT", Comparison::Type::kFloat}, + {"TOTALORDER", Comparison::Type::kFloatTotalOrder}, + {"SIGNED", Comparison::Type::kSigned}, + {"UNSIGNED", Comparison::Type::kUnsigned}, + }); + auto it = type_map->find(compare_type_name); + if (it == type_map->end()) { + return InvalidArgument("Unknown comparison type: %s", compare_type_name); + } + return it->second; +} + +std::string ComparisonTypeToString(Comparison::Type type) { + switch (type) { + case Comparison::Type::kFloat: + return "FLOAT"; + case Comparison::Type::kFloatTotalOrder: + return "TOTALORDER"; + case Comparison::Type::kSigned: + return "SIGNED"; + case Comparison::Type::kUnsigned: + return "UNSIGNED"; + } +} + +Comparison::Comparison(Direction dir, PrimitiveType type) + : dir_(dir), type_(DefaultComparisonType(type)) {} + +Comparison::Type Comparison::DefaultComparisonType(PrimitiveType type) { switch (type) { case S8: case S16: case S32: case S64: - type_ = Type::kSigned; - break; + return Type::kSigned; case PRED: case U8: case U16: case U32: case U64: - type_ = Type::kUnsigned; - break; + return Type::kUnsigned; case F16: case F32: case BF16: case F64: case C64: case C128: - type_ = Type::kFloat; - break; + return Type::kFloat; default: LOG(FATAL) << 
"Unsupported comparison mode." - << ComparisonDirectionToString(dir) << ":" << PrimitiveType_Name(type) << "\n"; } } @@ -164,20 +191,6 @@ bool Comparison::IsAntireflexive() const { } } -/* static */ const char* Comparison::ComparisonTypeToString( - Comparison::Type type) { - switch (type) { - case Type::kFloat: - return "f"; - case Type::kFloatTotalOrder: - return "ft"; - case Type::kSigned: - return "s"; - case Type::kUnsigned: - return "u"; - } -} - std::string Comparison::ToString(std::string prefix1, std::string prefix2) const { return prefix1 + std::string(ComparisonDirectionToString(dir_)) + prefix2 + diff --git a/tensorflow/compiler/xla/comparison_util.h b/tensorflow/compiler/xla/comparison_util.h index 11335c6b5ba..33ae2c67106 100644 --- a/tensorflow/compiler/xla/comparison_util.h +++ b/tensorflow/compiler/xla/comparison_util.h @@ -103,11 +103,11 @@ class Comparison { bool Compare(const T a, const T b) const { return GetComparator()(a, b); } + static Type DefaultComparisonType(PrimitiveType t); private: static Direction Converse(Direction dir); static Direction Inverse(Direction dir); - static const char* ComparisonTypeToString(Type type); const Direction dir_; Type type_; @@ -117,10 +117,14 @@ inline std::ostream& operator<<(std::ostream& os, const Comparison& cmp) { return os << cmp.ToString(); } string ComparisonDirectionToString(Comparison::Direction direction); +std::string ComparisonTypeToString(Comparison::Type type); StatusOr StringToComparisonDirection( absl::string_view direction_name); +StatusOr StringToComparisonType( + absl::string_view compare_type_name); + using ComparisonDirection = Comparison::Direction; } // namespace xla diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index 16563bab5bc..a926e8b3c88 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -89,6 +89,32 @@ class Sharding(object): tile_assignment_dimensions=dims, tile_assignment_devices=list(flattened_devices))) + @classmethod + def partial_tile(cls, tile_assignment): + """Returns a partially tiled sharding attribute. + + This is similar to tile(), but tile_assignment has one more dimension than + the tensor, and tiles in the last dimension of tile_assignment are + replicated. + + Args: + tile_assignment: An np.ndarray describing the topology of the tiling and + which device will compute which part of the topology. + + Raises: + TypeError: tile_assignment was not of np.array type. + """ + if not isinstance(tile_assignment, _np.ndarray): + raise TypeError('PartialTile assignment must be of type np.ndarray') + dims = list(tile_assignment.shape) + flattened_devices = tile_assignment.reshape(-1, order='C') + return Sharding( + proto=xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.OTHER, + tile_assignment_dimensions=dims, + tile_assignment_devices=list(flattened_devices), + replicate_on_last_tile_dim=True)) + @classmethod def split(cls, tensor, split_dimension, num_devices, input_shape=None): """Returns a Sharding that splits a tensor across a dimension. @@ -245,6 +271,23 @@ def split(tensor, return tensor +def partial_tile(tensor, tile_assignment, use_sharding_op=False): + """Returns a tensor that has tiled sharding. + + Args: + tensor: A tf.Tensor to shard. + tile_assignment: An np.ndarray describing the topology of the tiling and + which device will compute which part of the topology. 
It must have one + more dimension than tensor, and the last dimension represents partially + replicated tiles. + use_sharding_op: If true, adds a sharding op to set the sharding. + """ + if use_sharding_op: + tensor = tf2xla.sharding(tensor) + Sharding.partial_tile(tile_assignment).apply_to_tensor(tensor) + return tensor + + def get_op_sharding(op): """Returns sharding attribute of an op. @@ -313,20 +356,30 @@ def mesh_split(tensor, use_sharding_op: If true, adds a sharding op to set the sharding. Raises: - ValueError: The number of tensor split dimensions is different from device - mesh rank. + ValueError: The number of tensor split dimensions is larger than device mesh + rank. """ permutation = [d for d in tensor_split_dims_mapping if d >= 0] - if len(permutation) != len(device_mesh.shape): + if len(permutation) > len(device_mesh.shape): raise ValueError( - 'Number of tensor split dimensions (%r) is different from device mesh ' + 'Number of tensor split dimensions (%r) is larger than device mesh ' 'rank (%r). tensor_split_dims_mapping: %r, device_mesh.shape: %r' % (len(permutation), len( device_mesh.shape), tensor_split_dims_mapping, device_mesh.shape)) - tile_assignment = _np.transpose(device_mesh, permutation) + # Append replicated dimensions to the end. + transpose_permutation = permutation + [ + d for d in range(len(device_mesh.shape)) if d not in permutation + ] + tile_assignment = _np.transpose(device_mesh, transpose_permutation) tile_shape = [ 1 if d < 0 else device_mesh.shape[d] for d in tensor_split_dims_mapping ] + partial = len(permutation) < len(device_mesh.shape) + if partial: + tile_shape.append(_np.prod(device_mesh.shape) // _np.prod(tile_shape)) tile_assignment = _np.reshape(tile_assignment, tile_shape) + if partial: + return partial_tile( + tensor, tile_assignment, use_sharding_op=use_sharding_op) return tile(tensor, tile_assignment, use_sharding_op=use_sharding_op) diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index 51d666fba9a..45abd9b4c92 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -121,8 +121,8 @@ example. ### AOT (Ahead-of-time) compilation for CPU with `tfcompile` -You can also use a standalone [`tfcompile`](./tfcompile) tool, -which converts TensorFlow graph into executable code (for x86-64 CPU only). +You can also use a standalone [`tfcompile`](./tfcompile.md) tool, which converts +TensorFlow graph into executable code (for x86-64 CPU only). ## Inspect compiled programs @@ -196,7 +196,7 @@ Apart from TensorFlow, XLA programs can be generated by: [XLA source](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla) on Github! - diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 3031bfbf2e2..051c1539f6b 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1235,7 +1235,10 @@ floating-point types. Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge` (greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt` -(less-than). +(less-than). Another set of operators, EqTotalOrder, NeTotalOrder, GeTotalOrder, +GtTotalOrder, LeTotalOrder, and LtTotalOrder, provide the same functionalities, +except that they additionally support a total order over the floating point +numbers, by enforcing -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN. 
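Reviewer note: the operation_semantics.md paragraph above defines the total order -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN for the new TotalOrder comparisons. One common way to realize such an order on IEEE-754 floats (shown here as a sketch, not necessarily how XLA lowers these ops) is to map each value's bit pattern to an unsigned key that is monotone in that order:

```cpp
// Total order over f32 values, including -0 vs +0 and NaNs of either sign:
// map the IEEE-754 bit pattern to an unsigned key that increases along
// -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

uint32_t TotalOrderKey(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));  // bit_cast
  // Negative values: flip all bits so larger magnitudes sort earlier.
  // Non-negative values: set the sign bit so they sort after all negatives.
  return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u);
}

bool LtTotalOrder(float a, float b) {
  return TotalOrderKey(a) < TotalOrderKey(b);
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float neg_nan = std::copysign(nan, -1.0f);
  std::cout << std::boolalpha;
  std::cout << LtTotalOrder(-0.0f, 0.0f) << "\n";   // true: -0 < +0
  std::cout << LtTotalOrder(-inf, -1.0f) << "\n";   // true: -Inf < -Finite
  std::cout << LtTotalOrder(1.0f, nan) << "\n";     // true: +Finite < +NaN
  std::cout << LtTotalOrder(neg_nan, -inf) << "\n"; // true: -NaN < -Inf
}
```

Unlike the default floating-point comparison, this ordering makes NaNs comparable and distinguishes -0 from +0, which is exactly what the TotalOrder variants add.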
Arguments | Type | Semantics --------- | ------- | ---------------------------------------- diff --git a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb index c0160f2766c..d7799093583 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb @@ -169,7 +169,7 @@ " model.set_weights(initial_weights)\n", "\n", "warmup(model, x_train, y_train, x_test, y_test)\n", - "%time train_model(model, x_train, y_train, x_test, y_test)\n", + "train_model(model, x_train, y_train, x_test, y_test)\n", "\n", "scores = model.evaluate(x_test, y_test, verbose=1)\n", "print('Test loss:', scores[0])\n", diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 3807e6d3a56..d26e0881c53 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -1004,14 +1004,20 @@ absl::optional LiteralBase::GetIntegralAsS64( switch (shape().element_type()) { case PRED: return Get(multi_index); + case S8: + return Get(multi_index); case U8: return Get(multi_index); + case S16: + return Get(multi_index); + case U16: + return Get(multi_index); case S32: return Get(multi_index); - case S64: - return Get(multi_index); case U32: return Get(multi_index); + case S64: + return Get(multi_index); case U64: return Get(multi_index); default: diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index 6e61e0600a0..54240587282 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -59,6 +59,10 @@ cc_library( name = "tracked_device_buffer", srcs = ["tracked_device_buffer.cc"], hdrs = ["tracked_device_buffer.h"], + visibility = [ + "//learning/pathways/data_parallel:__pkg__", + "//tensorflow:internal", + ], deps = [ ":event_pool", ":local_device_state", @@ -204,6 +208,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core/common_runtime:bfc_allocator", "//tensorflow/core/common_runtime/gpu:gpu_mem_allocator", + "//tensorflow/core:lib_internal", "//tensorflow/stream_executor:tf_allocator_adapter", ] + if_cuda(["@local_config_nccl//:nccl"]), ) diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.cc b/tensorflow/compiler/xla/pjrt/cpu_device.cc index be70c16fc12..e2543bda7df 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc @@ -25,8 +25,8 @@ static const char kCpuPlatformName[] = "cpu"; CpuDevice::CpuDevice(int id, std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kCpuPlatformName, - /*device_kind=*/kCpuPlatformName) {} + : PjRtDevice(id, std::move(local_device_state), kCpuPlatformName, + /*device_kind=*/kCpuPlatformName) {} StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(se::Platform * platform, @@ -39,7 +39,7 @@ StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - std::vector> devices; + std::vector> devices; for (int i = 0; i < client->device_count(); ++i) { se::StreamExecutorConfig config; config.ordinal = i; diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.h b/tensorflow/compiler/xla/pjrt/cpu_device.h index c70d90ae228..ad0079b1c4a 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.h +++ b/tensorflow/compiler/xla/pjrt/cpu_device.h @@ -23,7 +23,7 @@ limitations under the License. 
namespace xla { -class CpuDevice : public Device { +class CpuDevice : public PjRtDevice { public: CpuDevice(int id, std::unique_ptr local_device_state); }; diff --git a/tensorflow/compiler/xla/pjrt/distributed/BUILD b/tensorflow/compiler/xla/pjrt/distributed/BUILD index 5cada95390c..175b4268dda 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/BUILD +++ b/tensorflow/compiler/xla/pjrt/distributed/BUILD @@ -52,6 +52,9 @@ cc_library( tf_cc_test( name = "service_test", srcs = ["service_test.cc"], + tags = [ + "nomsan", # b/163629207 + ], deps = [ ":protocol_proto_cc", ":service", @@ -106,6 +109,9 @@ cc_library( tf_cc_test( name = "client_server_test", srcs = ["client_server_test.cc"], + tags = [ + "nomsan", # b/163629207 + ], deps = [ ":client", ":protocol_proto_cc", diff --git a/tensorflow/compiler/xla/pjrt/distributed/client.cc b/tensorflow/compiler/xla/pjrt/distributed/client.cc index 55b02c6a09e..43c0c7b277d 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/client.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/client.cc @@ -17,6 +17,7 @@ limitations under the License. #include // NOLINT +#include "absl/time/time.h" #include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" #include "tensorflow/compiler/xla/pjrt/distributed/util.h" @@ -36,6 +37,7 @@ xla::Status DistributedRuntimeClient::Connect( ctx.set_deadline(absl::ToChronoTime(absl::Now() + rpc_timeout_)); ConnectRequest request; request.set_protocol_version(kDistributedRuntimeProtocolVersion); + request.set_timeout_milliseconds(absl::ToInt64Milliseconds(rpc_timeout_)); *request.mutable_local_topology() = local_topology; VLOG(10) << "Connect: " << request.DebugString(); ConnectResponse response; diff --git a/tensorflow/compiler/xla/pjrt/distributed/protocol.h b/tensorflow/compiler/xla/pjrt/distributed/protocol.h index 4daa939ac8d..e8be43006f7 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/protocol.h +++ b/tensorflow/compiler/xla/pjrt/distributed/protocol.h @@ -18,7 +18,7 @@ limitations under the License. namespace xla { -static constexpr int kDistributedRuntimeProtocolVersion = 1; +static constexpr int kDistributedRuntimeProtocolVersion = 2; } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/distributed/protocol.proto b/tensorflow/compiler/xla/pjrt/distributed/protocol.proto index 18bfa221110..c3bbb3a7f5d 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/protocol.proto +++ b/tensorflow/compiler/xla/pjrt/distributed/protocol.proto @@ -61,6 +61,7 @@ message ConnectRequest { int32 protocol_version = 1; // Always 1 at present. LocalTopologyProto local_topology = 2; + int32 timeout_milliseconds = 3; } message ConnectResponse { diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.cc b/tensorflow/compiler/xla/pjrt/distributed/service.cc index 3325fcd8319..868529637de 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.cc +++ b/tensorflow/compiler/xla/pjrt/distributed/service.cc @@ -15,6 +15,7 @@ limitations under the License. 
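Reviewer note: the protocol change above lets the client put its own RPC deadline into `ConnectRequest.timeout_milliseconds`, and the service-side hunk that follows waits for exactly that long instead of a fixed `kConnectTimeout`. A toy rendezvous with a caller-supplied deadline, written in plain standard C++ rather than the absl/gRPC code, shows the same shape:

```cpp
// Toy rendezvous whose wait duration comes from the request, mirroring how
// the Connect() handler now uses request->timeout_milliseconds() instead of
// a hard-coded constant.
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class Rendezvous {
 public:
  explicit Rendezvous(int expected) : expected_(expected) {}

  // Returns true if all nodes arrived before `timeout` elapsed.
  bool Connect(std::chrono::milliseconds timeout) {
    std::unique_lock<std::mutex> lock(mu_);
    ++arrived_;
    cv_.notify_all();
    return cv_.wait_for(lock, timeout, [&] { return arrived_ >= expected_; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int expected_;
  int arrived_ = 0;
};

int main() {
  Rendezvous rendezvous(/*expected=*/2);
  std::thread other([&] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    rendezvous.Connect(std::chrono::milliseconds(500));
  });
  // Each caller picks its own deadline, just like ConnectRequest now does.
  bool ok = rendezvous.Connect(std::chrono::milliseconds(500));
  other.join();
  std::cout << (ok ? "all nodes connected" : "timed out") << "\n";
}
```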
#include "tensorflow/compiler/xla/pjrt/distributed/service.h" +#include "absl/time/time.h" #include "tensorflow/compiler/xla/pjrt/distributed/protocol.h" #include "tensorflow/compiler/xla/pjrt/distributed/util.h" #include "tensorflow/compiler/xla/status.h" @@ -69,11 +70,12 @@ void BuildGlobalTopology(absl::Span local_topologies, mu_.AssertHeld(); return num_nodes_present_ == nodes_.size(); }; + auto connect_timeout = absl::Milliseconds(request->timeout_milliseconds()); if (!mu_.AwaitWithTimeout(absl::Condition(&all_nodes_present), - kConnectTimeout)) { + connect_timeout)) { return ToGrpcStatus(tensorflow::errors::DeadlineExceeded( "Timed out after %s waiting for all nodes to call Connect()", - absl::FormatDuration(kConnectTimeout))); + absl::FormatDuration(connect_timeout))); } if (node_id == 0) { diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.h b/tensorflow/compiler/xla/pjrt/distributed/service.h index 9ecbdb3cc7c..fe323d9f3b2 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.h +++ b/tensorflow/compiler/xla/pjrt/distributed/service.h @@ -50,8 +50,6 @@ class DistributedRuntimeServiceImpl final KeyValueSetResponse* response) override; private: - const absl::Duration kConnectTimeout = absl::Seconds(120); - absl::Mutex mu_; enum class State { kInitializing, kRunning }; State state_ ABSL_GUARDED_BY(mu_) = State::kInitializing; diff --git a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc index d54be61fbb8..298c41c7f58 100644 --- a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc +++ b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc @@ -32,7 +32,7 @@ TEST(GpuMultiStream, Basics) { GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(), /*distributed_client=*/nullptr, /*node_id=*/0)); - Device* device = client->local_devices().at(0); + PjRtDevice* device = client->local_devices().at(0); int n = 1024; Shape shape = ShapeUtil::MakeShape(S32, {n}); diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.cc b/tensorflow/compiler/xla/pjrt/interpreter_device.cc index f7138a8c181..c1149f2dbf9 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.cc +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.cc @@ -25,8 +25,8 @@ static const char kInterpreterPlatformName[] = "interpreter"; InterpreterDevice::InterpreterDevice( int id, std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kInterpreterPlatformName, - /*device_kind=*/kInterpreterPlatformName) {} + : PjRtDevice(id, std::move(local_device_state), kInterpreterPlatformName, + /*device_kind=*/kInterpreterPlatformName) {} StatusOr> GetInterpreterClient() { TF_ASSIGN_OR_RETURN(se::Platform * platform, @@ -40,7 +40,7 @@ StatusOr> GetInterpreterClient() { TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - std::vector> devices; + std::vector> devices; se::StreamExecutor* executor = client->backend().stream_executor(0).ValueOrDie(); auto device_state = absl::make_unique( diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.h b/tensorflow/compiler/xla/pjrt/interpreter_device.h index 58b210ad762..cf732f70124 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.h +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.h @@ -23,7 +23,7 @@ limitations under the License. 
namespace xla { -class InterpreterDevice : public Device { +class InterpreterDevice : public PjRtDevice { public: InterpreterDevice(int id, std::unique_ptr local_device_state); diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index edffaf6c877..6e387f8738f 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_mem_allocator.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/tf_allocator_adapter.h" namespace xla { @@ -89,12 +90,20 @@ StatusOr> CreateBFCAllocator( CHECK_GT(local_devices.size(), 0); const se::Platform* platform = local_devices.front()->executor()->platform(); std::vector allocators; + bool enable_unified_memory; + Status status = tensorflow::ReadBoolFromEnvVar("TF_FORCE_UNIFIED_MEMORY", + false, &enable_unified_memory); + if (!status.ok()) { + LOG(ERROR) << "Unable to read TF_FORCE_UNIFIED_MEMORY: " + << status.error_message(); + } + for (auto& local_device : local_devices) { se::StreamExecutor* executor = local_device->executor(); int device_ordinal = executor->device_ordinal(); auto sub_allocator = absl::make_unique( executor, tensorflow::PlatformGpuId(device_ordinal), - /*use_unified_memory=*/false, + /*use_unified_memory=*/enable_unified_memory, /*alloc_visitors=*/std::vector(), /*free_visitors=*/std::vector()); @@ -104,7 +113,10 @@ StatusOr> CreateBFCAllocator( return Unavailable("Failed to query available memory from device %i", device_ordinal); } - size_t allocator_memory = free_memory * memory_fraction; + // To allow full GPU memory to be visible to the BFC allocator if using + // unified memory. + size_t allocator_memory = + enable_unified_memory ? 
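Reviewer note: the nvidia_gpu_device.cc hunk above enables CUDA unified memory when the `TF_FORCE_UNIFIED_MEMORY` environment variable is set, and in that case sizes the BFC allocator from total device memory instead of `free_memory * memory_fraction`. The sketch below reproduces just that decision with plain `getenv` parsing; the helper names are stand-ins, not the TensorFlow APIs.

```cpp
// Sketch: read a boolean from the environment with a default, then pick the
// allocator budget the way the unified-memory path does.
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>

// Returns `default_value` when the variable is unset; accepts "1"/"true"
// (a simplified stand-in for ReadBoolFromEnvVar).
bool ReadBoolEnv(const char* name, bool default_value) {
  const char* value = std::getenv(name);
  if (value == nullptr) return default_value;
  return std::strcmp(value, "1") == 0 || std::strcmp(value, "true") == 0;
}

size_t AllocatorBudget(size_t free_memory, size_t total_memory,
                       double memory_fraction, bool unified_memory) {
  // With unified memory, expose the full device memory to the allocator;
  // otherwise stay within a fraction of what is currently free.
  return unified_memory
             ? total_memory
             : static_cast<size_t>(free_memory * memory_fraction);
}

int main() {
  const bool unified = ReadBoolEnv("TF_FORCE_UNIFIED_MEMORY", false);
  const size_t free_memory = 6ull << 30;    // e.g. 6 GiB currently free
  const size_t total_memory = 16ull << 30;  // e.g. 16 GiB on the device
  std::cout << "unified memory: " << std::boolalpha << unified << "\n";
  std::cout << "budget bytes: "
            << AllocatorBudget(free_memory, total_memory, 0.9, unified)
            << "\n";
}
```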
total_memory : free_memory * memory_fraction; if (preallocate) { LOG(INFO) << "XLA backend allocating " << allocator_memory << " bytes on device " << device_ordinal @@ -207,9 +219,9 @@ StatusOr NcclIdStore::GetNcclUniqueId(const NcclCliqueKey& key) { return cache_.emplace(key_string, result.ValueOrDie()).first->second; } -std::vector> BuildLocalDevices( +std::vector> BuildLocalDevices( std::vector> local_device_states) { - std::vector> devices; + std::vector> devices; for (auto& local_device : local_device_states) { int device_ordinal = local_device->device_ordinal(); const se::DeviceDescription& description = @@ -225,7 +237,7 @@ std::vector> BuildLocalDevices( Status BuildDistributedDevices( std::vector> local_device_states, std::shared_ptr distributed_client, int node_id, - std::vector>* devices, + std::vector>* devices, GpuExecutableRunOptions* gpu_executable_run_options) { LocalTopologyProto local_topology; local_topology.set_node_id(node_id); @@ -286,8 +298,8 @@ Status BuildDistributedDevices( GpuDevice::GpuDevice(int id, std::unique_ptr local_device_state, std::string device_kind, int node_id) - : Device(id, std::move(local_device_state), kGpuPlatformName, - std::move(device_kind), node_id) {} + : PjRtDevice(id, std::move(local_device_state), kGpuPlatformName, + std::move(device_kind), node_id) {} StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, @@ -302,7 +314,7 @@ StatusOr> GetNvidiaGpuClient( auto host_memory_allocator = GetGpuHostAllocator(local_device_states.front()->executor()); - std::vector> devices; + std::vector> devices; auto gpu_run_options = absl::make_unique(); if (distributed_client) { TF_RETURN_IF_ERROR(BuildDistributedDevices( diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h index bf59ddef3a9..4f22a169bd8 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h @@ -25,7 +25,7 @@ limitations under the License. namespace xla { -class GpuDevice : public Device { +class GpuDevice : public PjRtDevice { public: GpuDevice(int id, std::unique_ptr local_device_state, std::string device_kind, int node_id); diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index c5dce4a37f7..099c7729679 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -112,19 +112,19 @@ limitations under the License. 
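[Illustrative sketch, not part of the diff] The allocator change above sizes the BFC pool differently depending on TF_FORCE_UNIFIED_MEMORY: with unified memory the full device size is exposed (the pool may oversubscribe), otherwise only a fraction of the currently free memory is claimed. A standalone restatement of that rule; ChooseAllocatorBytes and its parameters are hypothetical names:

#include <cstddef>
#include <cstdint>

size_t ChooseAllocatorBytes(bool enable_unified_memory, int64_t free_memory,
                            int64_t total_memory, double memory_fraction) {
  // Unified memory may grow past physically free memory, so expose the whole
  // device; otherwise claim only a fraction of what is currently free.
  return enable_unified_memory
             ? static_cast<size_t>(total_memory)
             : static_cast<size_t>(free_memory * memory_fraction);
}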
namespace xla { -StatusOr Device::GetLocalDeviceState() const { +StatusOr PjRtDevice::GetLocalDeviceState() const { if (local_device_state_) { return local_device_state_.get(); } return InvalidArgument("Device %s is not a local device.", DebugString()); } -std::string Device::DebugString() const { +std::string PjRtDevice::DebugString() const { return absl::StrCat(platform_name(), ":", id()); } StatusOr DevicesToDeviceAssignment( - absl::Span> devices) { + absl::Span> devices) { if (devices.empty()) { return InvalidArgument( "Device assignment passed to Compile() must be non-empty."); @@ -175,7 +175,7 @@ class CpuAllocator : public tensorflow::Allocator { PjRtClient::PjRtClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, bool should_stage_host_to_device_transfers, @@ -201,7 +201,7 @@ PjRtClient::PjRtClient( host_memory_allocator_ = std::make_unique(); } - for (const std::unique_ptr& device : devices_) { + for (const std::unique_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device.get()}).second) << "Duplicate device id: " << device->id(); @@ -376,8 +376,9 @@ void RecordUsage(PjRtBuffer::ScopedHold device_buffer, // It is safe to delete the returned PjRtBuffer without further // synchronization if an error occurs before the buffer is used. StatusOr> AllocateDestinationBuffer( - const Shape& on_host_shape, Device* device, LocalDeviceState* local_device, - se::Stream* copy_stream, bool is_uninitialized_create, PjRtClient* client) { + const Shape& on_host_shape, PjRtDevice* device, + LocalDeviceState* local_device, se::Stream* copy_stream, + bool is_uninitialized_create, PjRtClient* client) { if (on_host_shape.IsTuple() && on_host_shape.tuple_shapes_size() == 0) { return InvalidArgument("Can't make a buffer from an empty tuple"); } @@ -574,7 +575,7 @@ StatusOr> PjRtBuffer::FromHostBuffer( const void* data, const Shape& shape, HostBufferSemantics host_buffer_semantics, std::shared_ptr buffer_reference, PjRtClient* client, - Device* device) { + PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostBuffer"); VLOG(2) << "PjRtBuffer::FromHostBuffer: shape: " << shape.ToString() << " device: " << device->DebugString(); @@ -736,7 +737,7 @@ StatusOr> PjRtBuffer::FromHostBuffer( /* static */ StatusOr> PjRtBuffer::CreateUninitialized( - const Shape& shape, PjRtClient* client, Device* device) { + const Shape& shape, PjRtClient* client, PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::CreateUninitialized"); VLOG(2) << "PjRtBuffer::CreateUninitialized: shape: " << shape.ToString() << " device: " << device->DebugString(); @@ -755,7 +756,7 @@ StatusOr> PjRtBuffer::CreateUninitialized( /* static */ StatusOr> PjRtBuffer::FromHostLiteral( - const LiteralSlice& literal, PjRtClient* client, Device* device) { + const LiteralSlice& literal, PjRtClient* client, PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostLiteral"); VLOG(2) << "PjRtBuffer::FromHostLiteral: shape: " << literal.shape().ToString() << " device: " << device->DebugString(); @@ -815,7 +816,7 @@ StatusOr> PjRtBuffer::FromHostLiteral( } /*static*/ void PjRtBuffer::MakeCrossHostReceiveBuffers( - absl::Span shapes, PjRtClient* client, Device* device, + absl::Span shapes, PjRtClient* client, PjRtDevice* device, PjRtCrossHostRecvNotifier&& notifier) { if (shapes.empty()) { notifier(InvalidArgument( @@ -849,7 
+850,7 @@ StatusOr> PjRtBuffer::FromHostLiteral( PjRtBuffer::PjRtBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - PjRtClient* client, Device* device) + PjRtClient* client, PjRtDevice* device) : client_(client), on_host_shape_(std::move(on_host_shape)), on_device_shape_(std::move(on_device_shape)), @@ -1189,7 +1190,7 @@ PjRtBuffer::ScopedHold PjRtBuffer::GetBufferWithHold(ScopedHold::Type type) { StatusOr, std::shared_ptr>> PjRtBuffer::CopyToDeviceHelper( - Device* dst_device, LocalDeviceState* dst_local_device, + PjRtDevice* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, std::shared_ptr src_device_buffer) { TF_ASSIGN_OR_RETURN( @@ -1249,7 +1250,7 @@ PjRtBuffer::CopyToDeviceHelper( } StatusOr> PjRtBuffer::CopyToDevice( - Device* dst_device) { + PjRtDevice* dst_device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::CopyToDevice"); if (dst_device == device_) { return InvalidArgument( @@ -1342,8 +1343,6 @@ namespace { // Helper struct for the tuple that is transiently constructed to hold the // arguments of an execution. struct TupleHandle { - // The tuple's shape on the host. - Shape on_host_shape; // The ExecutionInput describing the tuple. ExecutionInput execution_input; // A definition event that has been recorded on the host_to_device stream @@ -1414,8 +1413,7 @@ StatusOr MakeTupleHelper( auto transfer_event = std::make_shared(); transfer_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); - return TupleHandle({std::move(on_host_shape), std::move(execution_input), - std::move(transfer_event)}); + return TupleHandle({std::move(execution_input), std::move(transfer_event)}); } // Converts a ScopedShapedBuffer returned from an execution into a @@ -1423,20 +1421,20 @@ StatusOr MakeTupleHelper( std::unique_ptr OutputBufferHelper( ScopedShapedBuffer* result_buffer, std::shared_ptr definition_event, PjRtClient* client, - Device* device, LocalDeviceState* local_device) { + PjRtDevice* device, LocalDeviceState* local_device) { std::shared_ptr out_buffer = TrackedDeviceBuffer::FromScopedShapedBuffer(result_buffer, {definition_event}); - auto py_buffer = absl::make_unique( + auto pjrt_buffer = absl::make_unique( result_buffer->on_host_shape(), result_buffer->on_device_shape(), std::move(out_buffer), client, device); - RecordUsage(py_buffer->GetBufferWithUsageHold(), local_device, local_device, + RecordUsage(pjrt_buffer->GetBufferWithUsageHold(), local_device, local_device, definition_event, local_device->compute_stream(), /*prefer_to_retain_reference=*/false); - return py_buffer; + return pjrt_buffer; } -static Device* LookupDevice(const PjRtClient& client, int device_id) { +static PjRtDevice* LookupDevice(const PjRtClient& client, int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -1450,7 +1448,7 @@ PjRtExecutable::PjRtExecutable( bool parameter_is_tupled_arguments, std::shared_ptr device_assignment, std::vector> local_logical_device_ids, - std::vector local_devices, PjRtClient* client) + std::vector local_devices, PjRtClient* client) : client_(client), device_assignment_(std::move(device_assignment)), parameter_is_tupled_arguments_(parameter_is_tupled_arguments), @@ -1508,15 +1506,64 @@ const std::string& PjRtExecutable::name() const { } } +bool PjRtExecutable::MustDonateParameter(int executable_idx, + int parameter) const { + return 
parameters_that_must_be_donated_[executable_idx].contains(parameter); +} + +StatusOr> +PjRtExecutable::MakeExecutionInputsAndWaitForEvents( + int device_ordinal, const ExecuteOptions& options, + absl::Span argument_handles, + absl::Span device_buffers, + absl::flat_hash_set& events) const { + std::vector execution_inputs; + LocalDeviceState* device_state = &client_->device_state(device_ordinal); + // Lift tuple_handle outside the conditional so that the event it returns is + // not destroyed until after the loop below that waits on events. + absl::optional tuple_handle; + if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { + TF_ASSIGN_OR_RETURN(tuple_handle, + MakeTupleHelper(client_, device_state, argument_handles, + device_buffers, device_ordinal)); + events.insert(tuple_handle->event.get()); + execution_inputs.emplace_back(std::move(tuple_handle->execution_input)); + } else { + execution_inputs.reserve(argument_handles.size()); + for (int i = 0; i < argument_handles.size(); ++i) { + PjRtBuffer* handle = argument_handles[i]; + + // Make an ExecutionInput from the device buffer. + execution_inputs.emplace_back(handle->on_device_shape(), + handle->on_host_shape()); + ExecutionInput& execution_input = execution_inputs.back(); + ShapeTree::iterator input_iterator = + execution_input.MutableBuffers()->begin(); + ShapeTree::iterator iterator_end = + execution_input.MutableBuffers()->end(); + device_buffers[i].AddToInput(&input_iterator, iterator_end, + &execution_input, client_->allocator()); + CHECK(input_iterator == iterator_end); + } + } + + for (BufferSequencingEvent* event : events) { + event->WaitForEventOnStream(device_state->compute_stream()); + } + + return execution_inputs; +} + // Enqueues a computation onto the compute stream. Each buffer returned in // device_buffers has a usage hold added that must be dropped on error or // converted on success. StatusOr PjRtExecutable::EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, const ExecuteOptions& options, - Device* device, std::vector* device_buffers, + PjRtDevice* device, std::vector* device_buffers, std::shared_ptr device_assignment) const { int device_ordinal = device->local_device_state()->device_ordinal(); + LocalDeviceState* device_state = &client_->device_state(device_ordinal); tensorflow::profiler::TraceMeConsumer activity( "LocalExecutable::Execute", tensorflow::profiler::ContextType::kPjRt, run_id.ToInt()); @@ -1524,10 +1571,7 @@ StatusOr PjRtExecutable::EnqueueExecution( << " mapped to device ordinal for execution: " << device_ordinal; absl::flat_hash_set events; - std::vector execution_inputs; device_buffers->reserve(argument_handles.size()); - const absl::flat_hash_set& parameters_that_must_be_donated = - parameters_that_must_be_donated_[executable_idx]; for (int i = 0; i < argument_handles.size(); ++i) { PjRtBuffer* handle = argument_handles[i]; if (handle->device() != device) { @@ -1536,8 +1580,7 @@ StatusOr PjRtExecutable::EnqueueExecution( "device %s, but replica is assigned to device %s.", i, replica, handle->device()->DebugString(), device->DebugString()); } - bool must_donate = parameters_that_must_be_donated.find(i) != - parameters_that_must_be_donated.end(); + bool must_donate = MustDonateParameter(executable_idx, i); device_buffers->emplace_back(handle->GetBufferWithHold( must_donate ? 
PjRtBuffer::ScopedHold::kDonation : PjRtBuffer::ScopedHold::kUsage)); @@ -1571,37 +1614,10 @@ StatusOr PjRtExecutable::EnqueueExecution( } } - LocalDeviceState* device_state = &client_->device_state(device_ordinal); - absl::optional tuple_handle; - if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { - TF_ASSIGN_OR_RETURN(tuple_handle, - MakeTupleHelper(client_, device_state, argument_handles, - *device_buffers, device_ordinal)); - events.insert(tuple_handle->event.get()); - execution_inputs.emplace_back(std::move(tuple_handle->execution_input)); - } else { - execution_inputs.reserve(argument_handles.size()); - for (int i = 0; i < argument_handles.size(); ++i) { - PjRtBuffer* handle = argument_handles[i]; - - const PjRtBuffer::ScopedHold& device_buffer = (*device_buffers)[i]; - // Make an ExecutionInput from the device buffer. - execution_inputs.emplace_back(handle->on_device_shape(), - handle->on_host_shape()); - ExecutionInput& execution_input = execution_inputs.back(); - ShapeTree::iterator input_iterator = - execution_input.MutableBuffers()->begin(); - ShapeTree::iterator iterator_end = - execution_input.MutableBuffers()->end(); - device_buffer.AddToInput(&input_iterator, iterator_end, &execution_input, - client_->allocator()); - CHECK(input_iterator == iterator_end); - } - } - - for (BufferSequencingEvent* event : events) { - event->WaitForEventOnStream(device_state->compute_stream()); - } + TF_ASSIGN_OR_RETURN( + std::vector execution_inputs, + MakeExecutionInputsAndWaitForEvents( + device_ordinal, options, argument_handles, *device_buffers, events)); ExecutableRunOptions run_options; run_options.set_stream(device_state->compute_stream()); @@ -1676,11 +1692,45 @@ StatusOr PjRtExecutable::EnqueueExecution( return result_buffer_or_status.ConsumeValueOrDie().ConsumeResult(); } +std::vector> PjRtExecutable::MakeOutputBuffers( + int device_ordinal, const ExecuteOptions& options, + ScopedShapedBuffer result_buffer, + std::shared_ptr definition_event, + PjRtDevice* device) const { + std::vector> outputs; + LocalDeviceState* device_state = &client_->device_state(device_ordinal); + if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { + int tuple_count = result_buffer.on_host_shape().tuple_shapes_size(); + outputs.reserve(tuple_count); + // Take ownership of each of the output values, leaving only the root table + // in result_buffer. + for (int i = 0; i < tuple_count; ++i) { + ScopedShapedBuffer tuple_buffer = result_buffer.TakeSubTree({i}); + outputs.push_back(OutputBufferHelper(&tuple_buffer, definition_event, + client_, device, device_state)); + } + if (device_state->allocation_model() == LocalDeviceState::kSynchronous) { + // Don't release the root buffer until after execution completes. 
+ ShapedBuffer root_buffer_holder = result_buffer.release(); + se::DeviceMemoryBase root_buffer = root_buffer_holder.root_buffer(); + device_state->ThenExecuteOnCallbackThread( + device_state->compute_stream(), + [root_buffer, allocator{client_->allocator()}, device_ordinal]() { + TF_CHECK_OK(allocator->Deallocate(device_ordinal, root_buffer)); + }); + } + } else { + outputs.push_back(OutputBufferHelper(&result_buffer, definition_event, + client_, device, device_state)); + } + return outputs; +} + StatusOr>> PjRtExecutable::ExecuteHelper(absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, - Device* device) const { + PjRtDevice* device) const { std::shared_ptr device_assignment; if (device == nullptr) { CHECK(device_assignment_ != nullptr); @@ -1737,31 +1787,9 @@ PjRtExecutable::ExecuteHelper(absl::Span argument_handles, } auto definition_event = std::make_shared(); definition_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); - std::vector> outputs; - if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { - int tuple_count = result_buffer.on_host_shape().tuple_shapes_size(); - outputs.reserve(tuple_count); - // Take ownership of each of the output values, leaving only the root table - // in result_buffer. - for (int i = 0; i < tuple_count; ++i) { - ScopedShapedBuffer tuple_buffer = result_buffer.TakeSubTree({i}); - outputs.push_back(OutputBufferHelper(&tuple_buffer, definition_event, - client_, device, device_state)); - } - if (device_state->allocation_model() == LocalDeviceState::kSynchronous) { - // Don't release the root buffer until after execution completes. - ShapedBuffer root_buffer_holder = result_buffer.release(); - se::DeviceMemoryBase root_buffer = root_buffer_holder.root_buffer(); - device_state->ThenExecuteOnCallbackThread( - device_state->compute_stream(), - [root_buffer, allocator{client_->allocator()}, device_ordinal]() { - TF_CHECK_OK(allocator->Deallocate(device_ordinal, root_buffer)); - }); - } - } else { - outputs.push_back(OutputBufferHelper(&result_buffer, definition_event, - client_, device, device_state)); - } + std::vector> outputs = + MakeOutputBuffers(device_ordinal, options, std::move(result_buffer), + definition_event, device); for (PjRtBuffer::ScopedHold& b : device_buffers) { // prefer_to_retain_reference=false because when using the @@ -1801,7 +1829,7 @@ StatusOr>> PjRtExecutable::Execute( StatusOr>> PjRtExecutable::ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, + absl::Span argument_handles, PjRtDevice* device, const ExecuteOptions& options) const { if (device_assignment_ == nullptr) { VLOG(1) << "Executing portable single-core program on " @@ -1867,7 +1895,7 @@ PjRtExecutable::ExecuteOnLocalDevices( for (int i = 0; i < num_local_devices; ++i) { const int replica = local_logical_device_ids_[i].first; const int partition = local_logical_device_ids_[i].second; - Device* device = local_devices_[i]; + PjRtDevice* device = local_devices_[i]; const LocalDeviceState& device_state = *device->local_device_state(); device_state.execute_thread()->Schedule([&, replica, partition, i] { results[i] = ExecuteHelper(argument_handles[i], replica, partition, @@ -2114,12 +2142,12 @@ StatusOr, Shape>> GetShardedProgramShapes( build_options.set_result_layout(result_layout); std::vector> local_logical_device_ids; - std::vector local_devices; + std::vector local_devices; if (device_assignment != nullptr) { for (int replica = 0; replica < num_replicas; ++replica) { for 
(int partition = 0; partition < num_partitions; ++partition) { int device_id = (*device_assignment)(replica, partition); - Device* device = LookupDevice(*client, device_id); + PjRtDevice* device = LookupDevice(*client, device_id); if (device->host_id() != client->host_id()) { VLOG(3) << "Non-local device: " << device_id; continue; diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index bb9093a8bf7..39711534f79 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -52,17 +52,18 @@ namespace xla { class PjRtClient; -class Device { +class PjRtDevice { public: - explicit Device(int id, std::unique_ptr local_device_state, - std::string platform_name, std::string device_kind, - int host_id = 0) + explicit PjRtDevice(int id, + std::unique_ptr local_device_state, + std::string platform_name, std::string device_kind, + int host_id = 0) : id_(id), local_device_state_(std::move(local_device_state)), host_id_(host_id), platform_name_(std::move(platform_name)), device_kind_(std::move(device_kind)) {} - virtual ~Device() {} + virtual ~PjRtDevice() {} // The ID of this device. IDs are unique among devices of this type // (e.g. CPUs, GPUs). On multi-host platforms, this will be unique across all @@ -130,7 +131,7 @@ class PjRtClient { // `allocator` may null, in which case the platform default allocator is used. explicit PjRtClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, bool should_stage_host_to_device_transfers, @@ -142,11 +143,15 @@ class PjRtClient { int device_count() const { return devices_.size(); } int local_device_count() const { return local_devices_.size(); } - const std::vector>& devices() const { + const std::vector>& devices() const { return devices_; } - const std::vector& local_devices() const { return local_devices_; } - const std::map& id_to_device() const { return id_to_device_; } + const std::vector& local_devices() const { + return local_devices_; + } + const std::map& id_to_device() const { + return id_to_device_; + } int host_id() const { return host_id_; } const std::string& platform_name() const { return platform_name_; } @@ -210,11 +215,11 @@ class PjRtClient { std::unique_ptr host_memory_allocator_; // Includes all devices, including non-local devices on multi-host platforms. - std::vector> devices_; + std::vector> devices_; // Maps Device::id() to the corresponding Device. Includes all devices. - std::map id_to_device_; + std::map id_to_device_; // Local devices indexed by local device ordinal. - std::vector local_devices_; + std::vector local_devices_; int host_id_; se::DeviceMemoryAllocator* allocator_; @@ -233,7 +238,7 @@ class PjRtClient { // Converts a 2D set of Device objects indexed by [replica][partition] into an // xla::DeviceAssignment. StatusOr DevicesToDeviceAssignment( - absl::Span> devices); + absl::Span> devices); // Holds a reference from Python to a tuple of device buffers. A PjRtBuffer // can be either valid or invalid. An invalid buffer is one that has never been @@ -417,7 +422,7 @@ class PjRtBuffer { // Returns a buffer with uninitialized contents. 
static StatusOr> CreateUninitialized( - const Shape& shape, PjRtClient* client, Device* device); + const Shape& shape, PjRtClient* client, PjRtDevice* device); // Describes the semantics the caller to FromHostBuffer expects from the // runtime, in a total order from most restrictive to least restrictive. @@ -449,13 +454,13 @@ class PjRtBuffer { const void* data, const Shape& shape, HostBufferSemantics host_buffer_semantics, std::shared_ptr buffer_reference, PjRtClient* client, - Device* device); + PjRtDevice* device); // Note that literal must remain in scope until the transfer has completed, so // the caller should, for example, wait for BlockHostUntilReady() completes on // the return value before letting literal go out of scope. static StatusOr> FromHostLiteral( - const LiteralSlice& literal, PjRtClient* client, Device* device); + const LiteralSlice& literal, PjRtClient* client, PjRtDevice* device); // Asynchronously makes a vector of PjRtBuffers that can be used to receive // cross host transfers using `client` on `device'. `shapes` must be the exact @@ -467,12 +472,13 @@ class PjRtBuffer { // sending host and used in a call to CopyToRemoteDevice. None of the recv // buffers will become ready until *all* of the sends have completed. static void MakeCrossHostReceiveBuffers(absl::Span shapes, - PjRtClient* client, Device* device, + PjRtClient* client, + PjRtDevice* device, PjRtCrossHostRecvNotifier&& notifier); PjRtBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - PjRtClient* client, Device* device); + PjRtClient* client, PjRtDevice* device); ~PjRtBuffer(); PjRtBuffer(const PjRtBuffer&) = delete; @@ -482,7 +488,7 @@ class PjRtBuffer { const Shape& on_host_shape() const { return on_host_shape_; } const Shape& on_device_shape() const { return on_device_shape_; } - Device* device() const { return device_; } + PjRtDevice* device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } PjRtClient* client() const { return client_; } bool IsEmptyTuple() const { @@ -556,7 +562,7 @@ class PjRtBuffer { // Copies the buffer to device `dst_device`. Returns an error if the buffer is // already on dst_device. - StatusOr> CopyToDevice(Device* dst_device); + StatusOr> CopyToDevice(PjRtDevice* dst_device); // Copies the buffer to the remote device encoded in serialized_descriptor. 
// This call must be preceded by a call to MakeCrossHostReceiveBuffers on the @@ -629,7 +635,7 @@ class PjRtBuffer { StatusOr, std::shared_ptr>> - CopyToDeviceHelper(Device* dst_device, LocalDeviceState* dst_local_device, + CopyToDeviceHelper(PjRtDevice* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, std::shared_ptr src_device_buffer); @@ -637,7 +643,7 @@ class PjRtBuffer { PjRtClient* const client_; const Shape on_host_shape_; const Shape on_device_shape_; - Device* const device_; + PjRtDevice* const device_; mutable absl::Mutex mu_; std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); @@ -668,6 +674,11 @@ struct CompileOptions { bool compile_portable_executable = false; }; +class ExecuteContext { + public: + virtual ~ExecuteContext() = default; +}; + struct ExecuteOptions { // If true, the client must pass a single PjRtBuffer which contains all of // the arguments as a single XLA tuple, otherwise each argument must be @@ -682,6 +693,9 @@ struct ExecuteOptions { // multi-host programs are launched in different orders on different hosts, // the launch IDs may be used by the runtime to detect the mismatch. int32 launch_id = 0; + // If non-null, an opaque context passed to an execution that may be used to + // supply additional arguments to a derived class of PjRtExecutable. + const ExecuteContext* context = nullptr; }; // Represents a compiled computation that can be executed given handles to @@ -699,7 +713,7 @@ class PjRtExecutable { bool parameter_is_tupled_arguments, std::shared_ptr device_assignment, std::vector> local_logical_device_ids, - std::vector local_devices, PjRtClient* client); + std::vector local_devices, PjRtClient* client); virtual ~PjRtExecutable() = default; @@ -733,14 +747,16 @@ class PjRtExecutable { return local_logical_device_ids_; } - const std::vector& local_devices() const { return local_devices_; } + const std::vector& local_devices() const { + return local_devices_; + } StatusOr>> Execute( absl::Span argument_handles, const ExecuteOptions& options) const; StatusOr>> ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, + absl::Span argument_handles, PjRtDevice* device, const ExecuteOptions& options) const; // Execute on local devices. Takes a sequence of argument lists (one argument @@ -756,22 +772,42 @@ class PjRtExecutable { const string& name() const; + protected: + bool parameter_is_tupled_arguments() const { + return parameter_is_tupled_arguments_; + } + private: // Initializes information about which arguments to which executables must be // donated due to aliases that were specified by the computation. 
Status SetUpDonation(PjRtClient* client, bool tuple_inputs); + virtual bool MustDonateParameter(int executable_idx, int parameter) const; + + virtual StatusOr> + MakeExecutionInputsAndWaitForEvents( + int device_ordinal, const ExecuteOptions& options, + absl::Span argument_handles, + absl::Span device_buffers, + absl::flat_hash_set& events) const; + StatusOr EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, - const ExecuteOptions& options, Device* device, + const ExecuteOptions& options, PjRtDevice* device, std::vector* device_buffers, std::shared_ptr device_assignment) const; + virtual std::vector> MakeOutputBuffers( + int device_ordinal, const ExecuteOptions& options, + ScopedShapedBuffer result_buffer, + std::shared_ptr definition_event, + PjRtDevice* device) const; + StatusOr>> ExecuteHelper( absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, - Device* device = nullptr) const; + PjRtDevice* device = nullptr) const; // Create shared pointers so we can free them after the execution: with // asynchronous execution, the process being executed can outlive the @@ -800,7 +836,7 @@ class PjRtExecutable { // assigned. // shared_ptrs instead of unique_ptrs to play well with the Python bindings // (see xla.cc). - std::vector local_devices_; + std::vector local_devices_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc index 2143d1dfbe7..c932469c56a 100644 --- a/tensorflow/compiler/xla/primitive_util.cc +++ b/tensorflow/compiler/xla/primitive_util.cc @@ -112,6 +112,21 @@ xla::PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth) { } } +xla::PrimitiveType SignedIntegralTypeForBitWidth(int64 src_bitwidth) { + switch (src_bitwidth) { + case 8: + return xla::S8; + case 16: + return xla::S16; + case 32: + return xla::S32; + case 64: + return xla::S64; + default: + return xla::PRIMITIVE_TYPE_INVALID; + } +} + PrimitiveType ComplexComponentType(PrimitiveType complex_type) { switch (complex_type) { case C64: diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index 034c14e8930..1228b4f9a32 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -153,6 +153,8 @@ int BitWidth(PrimitiveType type); PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth); +PrimitiveType SignedIntegralTypeForBitWidth(int64 src_bitwidth); + // Returns the real, imag component type underlying the given complex type. // LOG(FATAL)'s if complex_type is not complex. 
PrimitiveType ComplexComponentType(PrimitiveType complex_type); diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index aa55a39218d..6ad1d789d48 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -6,7 +6,10 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "pybind_extension") package( - default_visibility = ["//tensorflow:internal"], + default_visibility = [ + "//learning/pathways/data_parallel/jax:__subpackages__", + "//tensorflow:internal", + ], licenses = ["notice"], # Apache 2.0 ) @@ -155,7 +158,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:bfloat16", "//tensorflow/core/platform:logging", "//third_party/py/numpy:headers", "//third_party/python_runtime:headers", # buildcleaner: keep @@ -242,6 +245,34 @@ cc_library( ], ) +cc_library( + name = "jax_jit", + srcs = ["jax_jit.cc"], + hdrs = ["jax_jit.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + visibility = ["//visibility:private"], + deps = [ + ":py_client", + ":pytree", + ":types", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:optional", + "@pybind11", + ], +) + cc_library( name = "ops", srcs = ["ops.cc"], @@ -257,6 +288,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/client/lib:lu_decomposition", "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/client/lib:qr", "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", @@ -327,6 +359,27 @@ cc_library( ], ) +# TODO(phawkins): this library is really part of JAX. Find a better home for it. +cc_library( + name = "pytree", + srcs = ["pytree.cc"], + hdrs = ["pytree.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/hash", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@pybind11", + ], +) + config_setting( name = "enable_gpu", values = {"define": "xla_python_enable_gpu=true"}, @@ -346,8 +399,10 @@ pybind_extension( deps = [ ":bfloat16", ":dlpack", + ":jax_jit", ":ops", ":py_client", + ":pytree", ":python_ref_manager", ":outfeed_receiver_py", ":traceback", diff --git a/tensorflow/compiler/xla/python/bfloat16.cc b/tensorflow/compiler/xla/python/bfloat16.cc index 1f21b3fb242..b70244cc3ef 100644 --- a/tensorflow/compiler/xla/python/bfloat16.cc +++ b/tensorflow/compiler/xla/python/bfloat16.cc @@ -27,7 +27,7 @@ limitations under the License. 
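[Illustrative usage, not part of the diff] The SignedIntegralTypeForBitWidth helper added to primitive_util above mirrors the existing unsigned variant. A minimal caller, assuming the helper lives in the xla::primitive_util namespace like its unsigned counterpart; SignedTypeForWidth is a hypothetical wrapper, and unsupported widths yield PRIMITIVE_TYPE_INVALID:

#include <cstdint>

#include "tensorflow/compiler/xla/primitive_util.h"

// Maps a bit width to the matching signed integral type,
// e.g. 8 -> S8, 16 -> S16, 32 -> S32, 64 -> S64.
xla::PrimitiveType SignedTypeForWidth(int64_t bit_width) {
  return xla::primitive_util::SignedIntegralTypeForBitWidth(bit_width);
}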
#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/dlpack.cc b/tensorflow/compiler/xla/python/dlpack.cc index 4fc17172ea7..67afa25d23e 100644 --- a/tensorflow/compiler/xla/python/dlpack.cc +++ b/tensorflow/compiler/xla/python/dlpack.cc @@ -193,7 +193,7 @@ StatusOr> StridesToLayout(absl::Span dims, return minor_to_major; } -StatusOr DLDeviceTypeForDevice(const Device& device) { +StatusOr DLDeviceTypeForDevice(const PjRtDevice& device) { const se::Platform* platform = device.local_device_state()->executor()->platform(); if (platform->id() == se::host::kHostPlatformId) { @@ -205,15 +205,15 @@ StatusOr DLDeviceTypeForDevice(const Device& device) { device.DebugString()); } -StatusOr DLContextForDevice(const Device& device) { +StatusOr DLContextForDevice(const PjRtDevice& device) { DLContext context; TF_ASSIGN_OR_RETURN(context.device_type, DLDeviceTypeForDevice(device)); context.device_id = device.local_device_state()->device_ordinal(); return context; } -StatusOr DeviceForDLContext(const PjRtClient& client, - const DLContext& context) { +StatusOr DeviceForDLContext(const PjRtClient& client, + const DLContext& context) { se::Platform::Id platform_id; switch (context.device_type) { case kDLCPU: @@ -226,7 +226,7 @@ StatusOr DeviceForDLContext(const PjRtClient& client, return InvalidArgument("Unknown/unsupported DLPack device type %d", context.device_type); } - auto it = absl::c_find_if(client.local_devices(), [&](Device* device) { + auto it = absl::c_find_if(client.local_devices(), [&](PjRtDevice* device) { return device->local_device_state()->executor()->platform()->id() == platform_id && device->local_device_state()->device_ordinal() == context.device_id; @@ -313,7 +313,7 @@ StatusOr> DLPackManagedTensorToBuffer( dlmt->dl_tensor.ndim); } TF_ASSIGN_OR_RETURN( - Device * device, + PjRtDevice * device, DeviceForDLContext(*client->pjrt_client(), dlmt->dl_tensor.ctx)); absl::Span dimensions( reinterpret_cast(dlmt->dl_tensor.shape), dlmt->dl_tensor.ndim); @@ -321,7 +321,8 @@ StatusOr> DLPackManagedTensorToBuffer( DLDataTypeToPrimitiveType(dlmt->dl_tensor.dtype)); std::vector minor_to_major; - if (dlmt->dl_tensor.strides && !absl::c_find(dimensions, 0)) { + if (dlmt->dl_tensor.strides && + absl::c_find(dimensions, 0) == dimensions.end()) { absl::Span strides( reinterpret_cast(dlmt->dl_tensor.strides), dlmt->dl_tensor.ndim); diff --git a/tensorflow/compiler/xla/python/jax_jit.cc b/tensorflow/compiler/xla/python/jax_jit.cc new file mode 100644 index 00000000000..2c364573e5b --- /dev/null +++ b/tensorflow/compiler/xla/python/jax_jit.cc @@ -0,0 +1,830 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This file implements the `jax.jit` dispatch and just-in-time feature. +// +// In a nutshell, `Jit(f)` returns a callable that will dispatch (i.e. forward, +// based on the dtypes/shapes/identity of the passed arguments) the execution to a +// just-in-time compiled XLA Executable. All of that is done in C++ for +// performance reasons. +// +// This file contains the utilities to: +// (a) inspect arguments and describe their structure, dtype/shapes, etc. +// (b) keep a mapping from function signatures to compiled XLA Executables. + +#include "tensorflow/compiler/xla/python/jax_jit.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" +#include "absl/synchronization/notification.h" +#include "absl/types/optional.h" +#include "pybind11/cast.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/python/py_buffer.h" +#include "tensorflow/compiler/xla/python/py_executable.h" +#include "tensorflow/compiler/xla/python/pytree.h" +#include "tensorflow/compiler/xla/python/types.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace xla { + +namespace py = pybind11; + +// TODO(phawkins): Add support for Tracers. +// TODO(jblespiau): Use absl Status. + +namespace { + +thread_local bool disable_jit; +void SetDisableJit(bool disable_jit_) { disable_jit = disable_jit_; } +bool GetDisableJit() { return disable_jit; } + +// Describes the abstract shape and dtype of an argument. +struct ArgSignature { + // This is the XLA dtype of the object. + xla::PrimitiveType dtype; + // JAX arguments can be of weak type, if and only if they are Python scalars + // or `DeviceArray` values such that `aval.weak_type` is true. + bool weak_type; + absl::InlinedVector shape; + bool operator==(const ArgSignature& other) const { + return std::tie(dtype, weak_type, shape) == + std::tie(other.dtype, other.weak_type, other.shape); + } + bool operator!=(const ArgSignature& other) const { return !(*this == other); } + + std::string DebugString() const { + std::string result = ""; + if (weak_type) { + absl::StrAppend(&result, "weak_"); + } + absl::StrAppend(&result, xla::PrimitiveType_Name(dtype)); + absl::StrAppend(&result, "[", absl::StrJoin(shape, ","), "]"); + return result; + } +}; + +template +H AbslHashValue(H h, const ArgSignature& s) { + h = H::combine(std::move(h), s.dtype); + if (!s.shape.empty()) { + h = H::combine_contiguous(std::move(h), &s.shape.front(), s.shape.size()); + } + return h; +} + +// The signature of a Python jitted function call, partitioned into: +// - dynamic positional arguments (i.e. positional args which are not static) +// - static positional arguments (i.e. the args associated with static_argnums) +// - keyword arguments +// The CallSignature should unambiguously identify a function call; thus, +// equality is based on: +// (a) the same PyTree for all dynamic positional arguments and keyword arguments +// (b) equality of the ArgSignature of the dynamic positional and keyword arguments +// (c) equality (delegated to Python) of the static arguments. +struct CallSignature { + struct KwargEntry { + // To avoid comparing strings, we intern the kwargs strings.
+ // The compilation cache holds a reference to all the keys. + py::handle key; + PyTreeDef value_treedef; + bool operator==(const KwargEntry& other) const { + return key.ptr() == other.key.ptr() && + value_treedef == other.value_treedef; + } + bool operator!=(const KwargEntry& other) const { return !(*this == other); } + }; + + // Only contains the arguments associated to `static_argnums`, sorted in the + // order of their argnum index. + std::vector static_args; + // A PyTreeDef for each positional dynamic (i.e. not static) argument. + std::vector dynamic_positional_args_treedef; + // Keyword arguments. Sorted by the interned keyword pointers. + std::vector keyword_args; + // Shape and dtype for both the dynamic positional arguments and the keyword + // arguments (sorted by interned keyword pointers). + std::vector dynamic_args_signatures; + PjRtDevice* device; + + bool operator==(const CallSignature& other) const { + return std::tie(dynamic_positional_args_treedef, static_args, keyword_args, + dynamic_args_signatures, device) == + std::tie(other.dynamic_positional_args_treedef, other.static_args, + other.keyword_args, other.dynamic_args_signatures, + other.device); + } + bool operator!=(const CallSignature& other) const { + return !(*this == other); + } + + // To be used when we want to keep ownership of Python values referenced by + // the `CallSignature` (i.e. when we insert an entry). + void IncRef() const; + // The destructor of the cache should call this on all entries. + void DecRef() const; + + std::string DebugString() const; +}; + +void CallSignature::IncRef() const { + for (const auto& kw : keyword_args) { + kw.key.inc_ref(); + } +} + +void CallSignature::DecRef() const { + for (const auto& kw : keyword_args) { + kw.key.dec_ref(); + } +} + +template +H AbslHashValue(H h, const CallSignature::KwargEntry& kw) { + h = H::combine(std::move(h), kw.key.ptr(), kw.value_treedef); + return h; +} + +template +H AbslHashValue(H h, const CallSignature& s) { + // /!\ important: We cannot include static arguments to the hash, because + // the py::object must be hashable for absl. We can try delegating to the + // Python __hash__, but there are many non-hashable Python types such as + // np.ndarray. + // TODO(jblespiau): We should either ban non-hashable objects from jit or we + // should hash them by object identity. 
+ h = H::combine_contiguous(std::move(h), + &s.dynamic_positional_args_treedef.front(), + s.dynamic_positional_args_treedef.size()); + h = H::combine_contiguous(std::move(h), &s.keyword_args.front(), + s.keyword_args.size()); + h = H::combine_contiguous(std::move(h), &s.dynamic_args_signatures.front(), + s.dynamic_args_signatures.size()); + h = H::combine(std::move(h), s.device); + return h; +} + +std::string CallSignature::DebugString() const { + std::vector static_args_str; + static_args_str.reserve(static_args.size()); + for (auto& static_arg : static_args) { + static_args_str.emplace_back(py::cast(static_arg.str())); + } + + std::vector signature_str; + signature_str.reserve(dynamic_args_signatures.size()); + + for (auto& arg_signature : dynamic_args_signatures) { + signature_str.emplace_back(arg_signature.DebugString()); + } + std::vector tree_def_str; + tree_def_str.reserve(dynamic_positional_args_treedef.size()); + for (auto& tree_def : dynamic_positional_args_treedef) { + tree_def_str.emplace_back(tree_def.ToString()); + } + std::vector keyword_names; + keyword_names.reserve(keyword_args.size()); + for (auto& kwarg_entry : keyword_args) { + keyword_names.emplace_back(py::cast(kwarg_entry.key)); + tree_def_str.emplace_back(kwarg_entry.value_treedef.ToString()); + } + return absl::StrCat( + static_args.size(), " static_args: ", absl::StrJoin(static_args_str, ","), + "\n", // new line + keyword_args.size(), " keyword args:", absl::StrJoin(keyword_names, ","), + "\n", // new-line + dynamic_positional_args_treedef.size(), " positional args.\n", + dynamic_args_signatures.size(), + " dynamic args (positional+keyword):\n - ", + absl::StrJoin(signature_str, ", "), "\n - ", + absl::StrJoin(tree_def_str, " | ")); +} + +struct CacheEntry { + std::shared_ptr executable; + xla::PjRtDevice* device; + PyTreeDef out_pytree_def; + // These are the objects required to create a `DeviceArray` object. + // We use Python types within the vector because this is what we will be + // returning to Python. No need to convert back and forth. + // We need py::object to keep the objects alive. + std::vector out_avals; + std::vector out_lazy_exprs; + // Ensures a single thread performs the compilation for a given executable. + // + // The first thread (holding the GIL) will create the CacheEntry associated + // with a signature; if the entry has already been inserted, other threads + // will wait for the notification. + absl::Notification compilation_complete; + absl::optional compilation_error = absl::nullopt; +}; + +// A `CompiledFunction` is associated with a `jax.jit(f)` and takes care of the +// bookkeeping of the different signatures used and the dispatch of calls to +// the correct underlying `PyExecutable`. This class is thread-safe. +class CompiledFunction { + public: + CompiledFunction(py::function fun, py::function cache_miss_fun, + py::function python_f_jitted, bool jax_enable_x64, + bool jax_disable_jit, std::vector static_argnums); + ~CompiledFunction(); + + // This function will: + // (a) flatten the inputs using pytree + // (b) get buffer objects from the arguments + // (c) call the executable + // (d) construct `DeviceArray` objects from the outputs + // (e) reconstruct the `PyTree`. + py::object Call(py::args args, py::kwargs kwargs); + + // This allows `inspect.signature(cpp_jitted_f)` from Python.
+ py::object __signature__() { + static const auto* inspect = new py::module(py::module::import("inspect")); + return inspect->attr("signature")(fun_); + } + + private: + CacheEntry& GetCacheEntry(const py::args& args, const py::kwargs& kwargs, + const CallSignature& signature, + absl::optional cache_miss_return); + CacheEntry& SetAndReturnCacheEntry( + const py::args& args, const py::kwargs& kwargs, + const CallSignature& signature, + absl::optional cache_miss_return = absl::nullopt); + bool JitIsDisabled() { return GetDisableJit() || jax_disable_jit_; } + + const py::function fun_; // The Python function to jit. + // The Python function in charge of returning a `xla::PyExecutable` from + // the arguments passed to `jitted_f`. + const py::function cache_miss_fun_; + // A function to call as fallback. This is the result of calling the Python + // `jax.jit`. + // TODO(jblespiau): Delete this when the C++ codepath supports all features. + const py::function python_f_jitted_; + + // The value of the Python flag when the object was created. + const bool jax_enable_x64_; + const bool jax_disable_jit_; + + // We need to know the static arguments to remove them from the arguments + // passed to the underlying PyExecutable. In sorted order. + std::vector static_argnums_; + // We need a `unique_ptr` here to ensure value pointer stability. + absl::flat_hash_map> executables_; + + // As top-level functions are decorated with `jax.jit`, when + // `CompiledFunction` is being instantiated from Python, the clients are not + // yet available (done after GoogleInit). They will be during the first call + // to `Call`. + std::shared_ptr pyclient_ = nullptr; + xla::PjRtDevice* default_device_ = nullptr; + + // IMPORTANT: The GIL is not always held, because we call back to Python and + // Python will release the GIL. + // Thus, we protect the critical section modifying the `executables_` map + // and more generally the compilation with some `absl::Notification`. + // The first thread reaching such point will be responsible to create the + // notification for the executable and others will wait until notified. + // It's safe because the first thread will be holding the GIL while + // initializing the `Notification`. + // + // absl::optional is not supported + bool first_compilation_started_ = false; + absl::Notification first_compilation_complete_; + absl::optional first_compilation_error_ = absl::nullopt; +}; + +CompiledFunction::CompiledFunction(py::function fun, + py::function cache_miss_fun, + py::function python_f_jitted, + bool jax_enable_x64, bool jax_disable_jit, + std::vector static_argnums) + : fun_(std::move(fun)), + cache_miss_fun_(std::move(cache_miss_fun)), + python_f_jitted_(std::move(python_f_jitted)), + jax_enable_x64_(jax_enable_x64), + jax_disable_jit_(jax_disable_jit), + static_argnums_(std::move(static_argnums)) { + std::sort(static_argnums_.begin(), static_argnums_.end()); +} + +CompiledFunction::~CompiledFunction() { + for (const auto& entry : executables_) { + entry.first.DecRef(); + } +} + +namespace { + +// The resulting information of the parsing and conversion of the arguments. +struct ParsedArgumentsAsBuffers { + // The call signature will be filled during 2 steps: + // - `FlattenArguments` will fill the static arguments and the pytree + // structures + // - the shapes and dtypes are filled later, by `ParseAndTransferArguments`. + CallSignature signature; + // The concatenation of the dynamic positional arguments and the sorted + // keyword arguments. 
We do not need ownership, thus the py::handle. + // TODO(jblespiau): We do not need py::object here and py::handle suffice and + // will prevent any counter increment. + std::vector flat_dynamic_args; + std::vector keep_alive_objects; + + // The following is only valid if the parsing succeeds. + std::vector arg_buffers; + // We may need to keep some objects around, because: + // (a) we need to extend the lifetime of objects created within + // `ConvertArgsToBuffers` + // (b) `arg_buffers` do not maintain ownership + std::vector, + std::unique_ptr>> + keep_alive; +}; + +// Filter out static arguments, flatten and concatenate other arguments (i.e. +// dynamic positional and keyword arguments), filling `arguments` in place. +void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, + absl::Span static_argnums, + ParsedArgumentsAsBuffers& arguments) { + arguments.flat_dynamic_args.reserve(args.size() + py_kwargs.size() - + static_argnums.size()); + arguments.signature.dynamic_positional_args_treedef.reserve( + args.size() - static_argnums.size()); + + // Positional arguments. + for (size_t i = 0; i < args.size(); ++i) { + if (std::find(static_argnums.begin(), static_argnums.end(), i) == + static_argnums.end()) { + PyTreeDef pytree_def; + pytree_def.FlattenInto(args[i], arguments.flat_dynamic_args); + arguments.signature.dynamic_positional_args_treedef.push_back(pytree_def); + } else { + arguments.signature.static_args.emplace_back( + // borrow is mandatory here. + py::reinterpret_borrow(args[i])); + } + } + + // Keyword arguments. + std::vector> kwargs(py_kwargs.begin(), + py_kwargs.end()); + // We first intern the keys, then sort them (by pointer) and then create + // the signatures. + arguments.signature.keyword_args.resize(kwargs.size()); + for (size_t i = 0; i < kwargs.size(); ++i) { + // Intern the key if not already interned. + if (!PyUnicode_CHECK_INTERNED(kwargs[i].first.ptr())) { + PyObject* key = kwargs[i].first.ptr(); + kwargs[i].first.inc_ref(); + PyUnicode_InternInPlace(&key); + arguments.keep_alive_objects.push_back( + py::reinterpret_steal(key)); + kwargs[i].first = py::handle(key); + } + } + + std::sort(kwargs.begin(), kwargs.end(), + [](const std::pair& a, + const std::pair& b) { + return a.first.ptr() < b.first.ptr(); + }); + for (size_t i = 0; i < kwargs.size(); ++i) { + arguments.signature.keyword_args[i].key = kwargs[i].first; + arguments.signature.keyword_args[i].value_treedef.FlattenInto( + kwargs[i].second, arguments.flat_dynamic_args); + } +} + +template +std::unique_ptr ConvertToScalarBuffer( + const py::handle& scalar, xla::PjRtClient* client, + xla::PjRtDevice* device) { + CppType data = py::cast(scalar); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, nullptr, + client, device)); +} + +// Convert a scalar to the associated PjRtBuffer or raises an error if it is +// not convertible (thus, this must be called after other checks). +StatusOr> ScalarToBuffer( + py::handle scalar, bool jax_enable_x64, xla::PjRtClient* client, + xla::PjRtDevice* device) { + // Important: In Python, isinstance(True, int) returns True. Thus, we have + // to check for bool before int. 
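  // (Illustrative aside, not part of the original change: CPython's bool is a
  // subclass of int, so an int check such as py::isinstance<py::int_> also
  // matches True/False; testing py::bool_ first is what lets booleans become
  // PRED-typed buffers instead of integer-typed ones.)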
+ if (py::isinstance(scalar)) { + return ConvertToScalarBuffer(scalar, client, device); + } else if (py::isinstance(scalar)) { + if (jax_enable_x64) { + return ConvertToScalarBuffer(scalar, client, device); + } else { + return ConvertToScalarBuffer(scalar, client, device); + } + } else if (py::isinstance(scalar)) { + if (jax_enable_x64) { + return ConvertToScalarBuffer(scalar, client, device); + + } else { + return ConvertToScalarBuffer(scalar, client, device); + } + } else if (PyComplex_Check(scalar.ptr())) { + Py_complex result = PyComplex_AsCComplex(scalar.ptr()); + if (result.real == -1.0 && PyErr_Occurred()) { + PyErr_Clear(); + throw std::runtime_error("Could not convert the complex number"); + } + if (jax_enable_x64) { + xla::complex128 data(result.real, result.imag); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, + nullptr, client, device)); + } else { + xla::complex64 data(result.real, result.imag); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, + nullptr, client, device)); + } + } + return InvalidArgument( + "%s", absl::StrCat( + "Not supported: The C++ jax jit execution path, only accepts " + "DeviceArray, Numpy arrays, or Python scalars. Got type ", + py::cast(scalar.get_type().str()))); +} + +const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { + static const auto* int64_dt = new py::dtype("int64"); + static const auto* int32_dt = new py::dtype("int32"); + static const auto* uint64_dt = new py::dtype("uint64"); + static const auto* uint32_dt = new py::dtype("uint32"); + static const auto* float64_dt = new py::dtype("float64"); + static const auto* float32_dt = new py::dtype("float32"); + static const auto* complex64_dt = new py::dtype("complex64"); + static const auto* complex128_dt = new py::dtype("complex128"); + + if (dtype == *int64_dt) { + return int32_dt; + } + if (dtype == *float64_dt) { + return float32_dt; + } + if (dtype == *uint64_dt) { + return uint32_dt; + } + if (dtype == *complex128_dt) { + return complex64_dt; + } + + return nullptr; +} + +// Converts flattened arguments contained in ParsedArgumentsAsBuffers in +// place. If arguments are `DeviceArray`, they must all be on the same `Device`. +// +// Returns `OkStatus()` on success. +Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, + xla::PjRtDevice* default_device, + ParsedArgumentsAsBuffers& arguments) { + std::vector& arg_buffers = arguments.arg_buffers; + auto& keep_alive = arguments.keep_alive; + + int num_flat_dynamic_args = arguments.flat_dynamic_args.size(); + arg_buffers.reserve(num_flat_dynamic_args); + arguments.signature.dynamic_args_signatures.reserve(num_flat_dynamic_args); + + static const auto* xla_module = + new py::module(py::module::import("jax.interpreters.xla")); + const auto& device_array = xla_module->attr("DeviceArray"); + + static const auto* numpy_module = new py::module(py::module::import("numpy")); + const auto& array = numpy_module->attr("array"); + + // TODO(phawkins): consider device stickiness. + // We first check whether any `DeviceArray` is present and whether they are + // attached to any specific device. 
See also + // https://github.com/google/jax/pull/1884 + // https://github.com/google/jax/pull/1916 for the rationale why the + // computation follows the data locality. + // It's also similar to PyTorch's behavior. + xla::PjRtDevice* data_device = nullptr; + for (py::handle arg : arguments.flat_dynamic_args) { + if (py::isinstance(arg, device_array)) { + xla::PyBuffer* buffer; + try { + // This can fail, e.g. when device_buffer is a `DeviceConstant`. + buffer = py::cast(arg.attr("device_buffer")); + } catch (const py::cast_error& e) { + return InvalidArgument( + "%s", + absl::StrCat("[jaxjit] Unsupported subclass of `DeviceArray`: " + "`device_buffer` field is of type ", + py::cast( + arg.attr("device_buffer").get_type().str()), + " while a `PyBuffer` was expected." + + )); + } + xla::PjRtDevice* device = buffer->buffer()->device(); + if (data_device && (device != data_device)) { + return InvalidArgument( + "%s", + absl::StrCat( + "Arguments to a jit-compiled function must be colocated on the " + "same device. Arguments were found to be on the two following " + "different devices: ", + device->DebugString(), " and ", data_device->DebugString())); + } else { + data_device = device; + } + } + } + if (!data_device) { + // No `DeviceArray` was found; default to `default_device`. + data_device = default_device; + } + CHECK(data_device); + arguments.signature.device = data_device; + xla::PjRtClient* pjrt_client = data_device->client(); + + for (py::handle arg : arguments.flat_dynamic_args) { + // We do not support transparent d2d transfers here. + // We assume all the `DeviceArray`s are already on the correct and shared + // device. + if (py::isinstance(arg, device_array)) { + xla::PyBuffer* buffer = + py::cast(arg.attr("device_buffer")); + arg_buffers.push_back(buffer->buffer()); + ArgSignature sig; + sig.dtype = buffer->shape().element_type(); + sig.shape.assign(buffer->shape().dimensions().begin(), + buffer->shape().dimensions().end()); + sig.weak_type = py::cast(arg.attr("aval").attr("weak_type")); + arguments.signature.dynamic_args_signatures.push_back(std::move(sig)); + } else if (py::isinstance(arg)) { + // TODO(jblespiau): Can we improve this call? Do we need the underlying + // GlobalPyRefManager() and co? + py::array numpy_array = py::cast(arg); + // If jax_enable_x64 is not set, we need to coerce 64-bit types to 32 bits. + // Note that this is calling back to Python!
+ if (!jax_enable_x64) { + const py::dtype* to_dtype = DtypeTo32BitDtype(numpy_array.dtype()); + if (to_dtype) { + numpy_array = array(numpy_array, to_dtype); + } + } + std::unique_ptr buffer = + ValueOrThrow(pyclient.BufferFromPyval( + numpy_array, data_device, + /*force_copy=*/false, /*host_buffer_semantics=*/ + xla::PjRtBuffer::HostBufferSemantics::kZeroCopy)); + arg_buffers.push_back(buffer->buffer()); + + ArgSignature sig; + sig.dtype = buffer->shape().element_type(); + sig.weak_type = false; + sig.shape.assign(buffer->shape().dimensions().begin(), + buffer->shape().dimensions().end()); + arguments.signature.dynamic_args_signatures.push_back(sig); + + keep_alive.emplace_back(std::move(buffer)); + } else { + StatusOr> buffer = + ScalarToBuffer(arg, jax_enable_x64, pjrt_client, data_device); + if (!buffer.ok()) { + return buffer.status(); + } + arg_buffers.push_back(buffer.ValueOrDie().get()); + ArgSignature sig; + sig.dtype = buffer.ValueOrDie()->on_host_shape().element_type(); + sig.weak_type = true; + arguments.signature.dynamic_args_signatures.push_back(sig); + + keep_alive.emplace_back(std::move(buffer).ValueOrDie()); + } + } + return Status::OK(); +} + +} // namespace + +CacheEntry& CompiledFunction::GetCacheEntry( + const py::args& args, const py::kwargs& kwargs, + const CallSignature& signature, + absl::optional cache_miss_return) { + auto found_iterator = executables_.find(signature); + if (found_iterator != executables_.end()) { // Cache hit! + if (!found_iterator->second->compilation_complete.HasBeenNotified()) { + py::gil_scoped_release gil_release; + found_iterator->second->compilation_complete.WaitForNotification(); + if (found_iterator->second->compilation_error) { + throw found_iterator->second->compilation_error.value(); + } + } + return *(found_iterator->second); + } + return SetAndReturnCacheEntry(args, kwargs, signature, cache_miss_return); +} +CacheEntry& CompiledFunction::SetAndReturnCacheEntry( + const py::args& args, const py::kwargs& kwargs, + const CallSignature& signature, + absl::optional cache_miss_return) { + // We need to insert the element. + auto result = executables_.emplace(signature, std::make_unique()); + auto it = result.first; + CacheEntry& cache_entry = *(it->second.get()); + // CallSignatures in the cache own their keyword argument reference. + result.first->first.IncRef(); + + // Cache miss? Call the Python cache miss function. + py::tuple executable_and_pytree; + if (cache_miss_return) { + executable_and_pytree = cache_miss_return.value(); + } else { + try { + executable_and_pytree = cache_miss_fun_(*args, **kwargs); + } catch (const std::exception& e) { + cache_entry.compilation_error = e; + cache_entry.compilation_complete.Notify(); + throw; + } + } + if (executable_and_pytree.size() != 4) { + throw std::runtime_error( + "AssertionError: The cache miss function should return 4 " + "arguments."); + } + cache_entry.executable = py::cast>( + std::move(executable_and_pytree[0])); + int num_devices = + cache_entry.executable->pjrt_executable().local_devices().size(); + if (num_devices != 1) { + throw std::runtime_error(absl::StrCat( + "Running on more than a single device is not currently supported." 
+ "The underlying PjRtExecutable has ", + num_devices)); + } + cache_entry.device = + cache_entry.executable->pjrt_executable().local_devices()[0]; + cache_entry.out_pytree_def = py::cast(executable_and_pytree[1]); + + py::list shaped_arrays = + py::reinterpret_borrow(executable_and_pytree[2]); + py::list lazy_expressions = + py::reinterpret_borrow(executable_and_pytree[3]); + + cache_entry.out_avals.reserve(shaped_arrays.size()); + cache_entry.out_lazy_exprs.reserve(lazy_expressions.size()); + + int num_outputs = shaped_arrays.size(); + for (int i = 0; i < num_outputs; ++i) { + py::object shaped_array = + py::reinterpret_borrow(shaped_arrays[i]); + py::object lazy_expr = + py::reinterpret_borrow(lazy_expressions[i]); + + cache_entry.out_avals.push_back(shaped_array); + cache_entry.out_lazy_exprs.push_back(lazy_expr); + } + + cache_entry.compilation_complete.Notify(); + return cache_entry; +} + +py::object CompiledFunction::Call(py::args args, py::kwargs kwargs) { + if (JitIsDisabled()) { + return fun_(*args, **kwargs); + } + ParsedArgumentsAsBuffers arguments; + FlattenArguments(args, kwargs, static_argnums_, arguments); + + // TODO(jblespiau): It would be preferable to have a single location for + // locking code. + absl::optional cache_miss_result = absl::nullopt; + if (!default_device_) { + // TODO(jblespiau): This code will deadlock if a jitted function + // recursively calls itself. + if (first_compilation_started_) { + if (!first_compilation_complete_.HasBeenNotified()) { + py::gil_scoped_release gil_release; + first_compilation_complete_.WaitForNotification(); + if (first_compilation_error_) { + throw first_compilation_error_.value(); + } + } + } else { + first_compilation_started_ = true; + try { + cache_miss_result = cache_miss_fun_(*args, **kwargs); + } catch (const std::exception& e) { + first_compilation_error_ = e; + first_compilation_complete_.Notify(); + throw; + } + auto executable = py::cast>( + cache_miss_result.value()[0]); + + pyclient_ = executable->client(); + default_device_ = executable->LocalDevices()[0].contents; + first_compilation_complete_.Notify(); + } + } + + // The C++ jit do not support Tracers arguments yet. The Python-based jit + // function will be called if any of the dynamic arguments is unsupported. 
+ if (!ConvertArgsToBuffers(jax_enable_x64_, *pyclient_, default_device_, + arguments) + .ok()) { + return python_f_jitted_(*args, **kwargs); + } + + CacheEntry& cache_entry = + GetCacheEntry(args, kwargs, arguments.signature, cache_miss_result); + + std::vector> outputs = + ValueOrThrow(cache_entry.executable->PjRtExecute(arguments.arg_buffers)); + + static const auto* xla_module = + new py::module(py::module::import("jax.interpreters.xla")); + const auto& device_array = xla_module->attr("DeviceArray"); + + const std::vector& out_avals = cache_entry.out_avals; + const std::vector& out_lazy_exprs = cache_entry.out_lazy_exprs; + + py::list flat_device_arrays; + for (int i = 0; i < outputs.size(); ++i) { + flat_device_arrays.append(device_array( + /*aval=*/out_avals[i], /*device=*/outputs[i]->device(), + /*lazy_expr=*/out_lazy_exprs[i], + /*device_buffer=*/std::move(outputs[i]))); + } + return cache_entry.out_pytree_def.Unflatten(flat_device_arrays); +} + +} // namespace + +void BuildJaxjitSubmodule(pybind11::module& m) { + py::module jitlib = m.def_submodule("jax_jit", "Jax C++ jit library"); + + py::class_> cfun( + jitlib, "CompiledFunction"); + cfun.def("__call__", &CompiledFunction::Call); + cfun.def_property_readonly("__signature__", &CompiledFunction::__signature__); + + jitlib.def("set_disable_jit", &SetDisableJit); + jitlib.def("get_disable_jit", &GetDisableJit); + jitlib.def( + "jit", + [](py::function fun, py::function cache_miss_fun, + py::function fallback_on_unsupported_argument, bool jax_enable_x64, + bool jax_disable_jit, + std::vector static_argnums) -> std::unique_ptr { + return std::make_unique( + std::move(fun), std::move(cache_miss_fun), + std::move(fallback_on_unsupported_argument), jax_enable_x64, + jax_disable_jit, std::move(static_argnums)); + }); + + // Only for testing purposes + jitlib.def("_ScalarToBuffer", [](py::handle scalar, bool jax_enable_x64, + std::shared_ptr client) { + xla::PjRtClient* pjrt_client = client->pjrt_client(); + + return std::make_unique( + client, + ScalarToBuffer(scalar, jax_enable_x64, pjrt_client, + pjrt_client->local_devices()[0]) + .ValueOrDie(), + nullptr); + }); +} + +} // namespace xla diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/compiler/xla/python/jax_jit.h similarity index 74% rename from tensorflow/python/util/tf32.cc rename to tensorflow/compiler/xla/python/jax_jit.h index 7dece6ccdae..2b1603aac27 100644 --- a/tensorflow/python/util/tf32.cc +++ b/tensorflow/compiler/xla/python/jax_jit.h @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "pybind11/pybind11.h" -#include "tensorflow/core/platform/tf32_utils.h" +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ -PYBIND11_MODULE(_pywrap_tf32_execution, m) { - m.def("allow", &tensorflow::allow_tf32_execution); - m.def("is_allowed", &tensorflow::tf32_execution_allowed); -} +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildJaxjitSubmodule(pybind11::module& m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ diff --git a/tensorflow/compiler/xla/python/ops.cc b/tensorflow/compiler/xla/python/ops.cc index 3ac4709b160..f8099412c73 100644 --- a/tensorflow/compiler/xla/python/ops.cc +++ b/tensorflow/compiler/xla/python/ops.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "pybind11/attr.h" #include "pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/lu_decomposition.h" #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/qr.h" #include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" @@ -186,6 +187,13 @@ void BuildOpsSubmodule(py::module* m) { return std::make_pair(qr.q, qr.r); }, py::arg("operand"), py::arg("full_matrices")); + ops.def( + "LU", + [](XlaOp a) -> StatusOr> { + LuDecompositionResult lu = LuDecomposition(a); + return std::make_tuple(lu.lu, lu.pivots, lu.permutation); + }, + py::arg("operand")); ops.def( "Eigh", [](XlaOp a, bool lower, int64 max_iter, diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.cc b/tensorflow/compiler/xla/python/outfeed_receiver.cc index 7c029ca7d19..f6067e650c0 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver.cc @@ -101,14 +101,14 @@ uint32_t constexpr kOutfeedCidShutdown = 0; // Encapsulates data received from a device outfeed. class OutfeedData { public: - OutfeedData(Device* device, uint32_t consumer_id, Shape shape) + OutfeedData(PjRtDevice* device, uint32_t consumer_id, Shape shape) : device_(device), consumer_id_(consumer_id), shape_(shape), literal_(nullptr), literal_size_bytes_(0) {} - Device* device() { return device_; } + PjRtDevice* device() { return device_; } uint32_t consumer_id() const { return consumer_id_; } Shape shape() const { return shape_; } std::unique_ptr literal() { @@ -123,7 +123,7 @@ class OutfeedData { std::string DebugString() const; private: - Device* device_; + PjRtDevice* device_; uint32_t consumer_id_; Shape shape_; std::unique_ptr literal_; @@ -187,8 +187,8 @@ class OutfeedReceiverImpl { Status SendShutdownOutfeedHeader(int device_idx); // Receives a raw Literal from a device outfeed. - StatusOr> ReceiveRawFromOutfeed(const Device* device, - const Shape& shape); + StatusOr> ReceiveRawFromOutfeed( + const PjRtDevice* device, const Shape& shape); // Enqueues received data in the callbaback queue. void EnqueueReceivedData(std::unique_ptr received) @@ -200,7 +200,7 @@ class OutfeedReceiverImpl { OutfeedReceiver::Callback callback_; // The devices on which we are listening. - std::vector devices_; + std::vector devices_; // Maximum bytes capacity of the callback queue. 
uint64_t max_callback_queue_size_bytes_; @@ -283,7 +283,7 @@ void OutfeedReceiverImpl::DeviceListenerThreadLoop(int device_idx) { absl::MutexLock lock(&mu_); ++num_listening_threads_; } - Device* device = devices_[device_idx]; + PjRtDevice* device = devices_[device_idx]; while (true) { Shape header_shape = ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}); std::unique_ptr header = @@ -339,7 +339,7 @@ void OutfeedReceiverImpl::EnqueueReceivedData( } StatusOr> OutfeedReceiverImpl::ReceiveRawFromOutfeed( - const Device* device, const Shape& shape) { + const PjRtDevice* device, const Shape& shape) { std::shared_ptr literal_shared; TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, @@ -390,7 +390,7 @@ void OutfeedReceiverImpl::CallbackThreadLoop() { } Status OutfeedReceiverImpl::SendShutdownOutfeedHeader(int device_idx) { - const Device* device = devices_[device_idx]; + const PjRtDevice* device = devices_[device_idx]; constexpr int consumer_id = kOutfeedCidShutdown; VLOG(2) << "[" << device->DebugString() << "] SendSpecialHeader cons=" << consumer_id; diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.h b/tensorflow/compiler/xla/python/outfeed_receiver.h index a8dcc559810..46e2e5d9526 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver.h +++ b/tensorflow/compiler/xla/python/outfeed_receiver.h @@ -33,7 +33,7 @@ class OutfeedReceiver { public: // A callback takes: device, consumer id, received. using Callback = - std::function)>; + std::function)>; // Constructs the receiver for the given clients and callback function. // diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc index d297df332ff..a732ab8e21a 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc @@ -40,7 +40,7 @@ class OutfeedReceiverForPython { public: // A callback to Python takes: consumer id, received literal. using CallbackToPython = - std::function, uint32_t, pybind11::object)>; + std::function, uint32_t, pybind11::object)>; OutfeedReceiverForPython(CallbackToPython callback_python, std::vector> clients, @@ -48,7 +48,7 @@ class OutfeedReceiverForPython { : callback_python_(std::move(callback_python)), clients_(std::move(clients)) { OutfeedReceiver::Callback callback = - [this](Device* device, uint32_t consumer_id, + [this](PjRtDevice* device, uint32_t consumer_id, std::shared_ptr literal) { this->Callback(device, consumer_id, std::move(literal)); }; @@ -86,7 +86,7 @@ class OutfeedReceiverForPython { arrays); } - void Callback(Device* device, uint32_t consumer_id, + void Callback(PjRtDevice* device, uint32_t consumer_id, std::shared_ptr literal) { { absl::MutexLock lock(&mu_); @@ -106,7 +106,7 @@ class OutfeedReceiverForPython { LiteralToPython(std::move(literal)).ValueOrDie(); // The callback_ should handle all exceptions in user-code. If we get // an exception here, it is a bug in the callback and we should stop. 
- callback_python_(WrapWithClient(*it, device), consumer_id, + callback_python_(WrapWithClient(*it, device), consumer_id, std::move(literal_python)); } diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc index e8a5063b70b..919dafe2e0b 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc @@ -78,11 +78,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedSimple) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -111,11 +111,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedTwoComputations) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -156,11 +156,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedTwoOutfeed) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -199,11 +199,11 @@ TEST(OutfeedReceiverTest, DifferentShapeForConsumerIdError) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -233,11 +233,11 @@ TEST(OutfeedReceiverTest, InvalidConsumerIdError) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); diff --git a/tensorflow/compiler/xla/python/py_buffer.cc b/tensorflow/compiler/xla/python/py_buffer.cc index ed4787310b4..b32fe047530 100644 --- a/tensorflow/compiler/xla/python/py_buffer.cc +++ 
b/tensorflow/compiler/xla/python/py_buffer.cc @@ -51,12 +51,12 @@ PyBuffer::~PyBuffer() { } } -ClientAndPtr PyBuffer::device() const { +ClientAndPtr PyBuffer::device() const { return WrapWithClient(client_, buffer_->device()); } StatusOr> PyBuffer::CopyToDevice( - const ClientAndPtr& dst_device) const { + const ClientAndPtr& dst_device) const { CHECK(dst_device.get() != nullptr); GlobalPyRefManager()->CollectGarbage(); std::unique_ptr out; diff --git a/tensorflow/compiler/xla/python/py_buffer.h b/tensorflow/compiler/xla/python/py_buffer.h index 76791e969cb..d7906574ec1 100644 --- a/tensorflow/compiler/xla/python/py_buffer.h +++ b/tensorflow/compiler/xla/python/py_buffer.h @@ -38,12 +38,12 @@ class PyBuffer { std::shared_ptr client() const { return client_; } PjRtBuffer* buffer() const { return buffer_.get(); } - ClientAndPtr device() const; + ClientAndPtr device() const; const std::string& platform_name() const { return buffer_->platform_name(); } bool is_deleted() const { return buffer_->IsDeleted(); } StatusOr> CopyToDevice( - const ClientAndPtr& dst_device) const; + const ClientAndPtr& dst_device) const; void Delete() { return buffer_->Delete(); } diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index 1f07c6e2042..6df11322564 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/python/py_client.h" +#include + #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/xla/python/py_buffer.h" #include "tensorflow/compiler/xla/python/py_executable.h" @@ -31,8 +33,8 @@ namespace pprof = tensorflow::tfprof::pprof; PyClient::PyClient(std::shared_ptr pjrt_client) : pjrt_client_(std::move(pjrt_client)) {} -std::vector> PyClient::Devices() { - std::vector> devices; +std::vector> PyClient::Devices() { + std::vector> devices; devices.reserve(pjrt_client_->devices().size()); for (const auto& device : pjrt_client_->devices()) { devices.push_back(WrapWithClient(shared_from_this(), device.get())); @@ -40,21 +42,21 @@ std::vector> PyClient::Devices() { return devices; } -std::vector> PyClient::LocalDevices() { - std::vector> devices; +std::vector> PyClient::LocalDevices() { + std::vector> devices; devices.reserve(pjrt_client_->local_devices().size()); - for (Device* device : pjrt_client_->local_devices()) { + for (PjRtDevice* device : pjrt_client_->local_devices()) { devices.push_back(WrapWithClient(shared_from_this(), device)); } return devices; } -StatusOr>>> +StatusOr>>> PyClient::GetDefaultDeviceAssignment(int num_replicas, int num_partitions) { TF_ASSIGN_OR_RETURN( DeviceAssignment device_assignment, pjrt_client_->GetDefaultDeviceAssignment(num_replicas, num_partitions)); - std::vector>> result; + std::vector>> result; result.resize(num_replicas); for (int r = 0; r < num_replicas; ++r) { result[r].resize(num_partitions); @@ -68,12 +70,12 @@ PyClient::GetDefaultDeviceAssignment(int num_replicas, int num_partitions) { return result; } -StatusOr>> +StatusOr>> PyClient::GetDefaultDeviceAssignment1D(int num_replicas) { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, pjrt_client_->GetDefaultDeviceAssignment( num_replicas, /*num_partitions=*/1)); - std::vector> result; + std::vector> result; for (int i = 0; i < num_replicas; ++i) { int device_id = device_assignment(i, 0); auto iter = pjrt_client_->id_to_device().find(device_id); @@ -84,7 +86,7 @@ PyClient::GetDefaultDeviceAssignment1D(int 
num_replicas) { } StatusOr> PyClient::BufferFromPyval( - const pybind11::object& argument, Device* device, bool force_copy, + const pybind11::object& argument, PjRtDevice* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics) { if (device == nullptr) { TF_RET_CHECK(!pjrt_client_->local_devices().empty()); @@ -104,7 +106,6 @@ StatusOr> PyClient::BufferFromPyval( return InvalidArgument("from_python argument must be an array."); } - TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument)); std::shared_ptr py_buffer_ref = GlobalPyRefManager()->ManageReference(std::move(c->array)); @@ -121,7 +122,7 @@ StatusOr> PyClient::BufferFromPyval( std::move(traceback)); } -StatusOr> PyClient::Compile( +StatusOr> PyClient::Compile( const XlaComputation& computation, CompileOptions options) { std::unique_ptr executable; absl::optional fingerprint; @@ -134,7 +135,7 @@ StatusOr> PyClient::Compile( pjrt_client_->ExecutableFingerprint(*executable)); } auto traceback = Traceback::Get(); - return std::make_unique( + return std::make_shared( shared_from_this(), std::move(executable), std::move(traceback), std::move(fingerprint)); } @@ -205,7 +206,7 @@ namespace { struct HeapProfileKey { Traceback* traceback; int64 size; - Device* device; + PjRtDevice* device; bool operator==(const HeapProfileKey& other) const; }; diff --git a/tensorflow/compiler/xla/python/py_client.h b/tensorflow/compiler/xla/python/py_client.h index d33f3dadd7d..f12a4ae4f0a 100644 --- a/tensorflow/compiler/xla/python/py_client.h +++ b/tensorflow/compiler/xla/python/py_client.h @@ -100,14 +100,14 @@ class PyClient : public std::enable_shared_from_this { int device_count() const { return pjrt_client_->device_count(); } int host_id() const { return pjrt_client_->host_id(); } - std::vector> Devices(); - std::vector> LocalDevices(); + std::vector> Devices(); + std::vector> LocalDevices(); - StatusOr>>> + StatusOr>>> GetDefaultDeviceAssignment(int num_replicas, int num_partitions); // TODO(skye): delete after all callers can handle 2D output - StatusOr>> GetDefaultDeviceAssignment1D( + StatusOr>> GetDefaultDeviceAssignment1D( int num_replicas); StatusOr CreateChannelHandle() { @@ -121,10 +121,10 @@ class PyClient : public std::enable_shared_from_this { } StatusOr> BufferFromPyval( - const pybind11::object& argument, Device* device, bool force_copy, + const pybind11::object& argument, PjRtDevice* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics); - StatusOr> Compile( + StatusOr> Compile( const XlaComputation& computation, CompileOptions options); pybind11::bytes HeapProfile(); diff --git a/tensorflow/compiler/xla/python/py_executable.cc b/tensorflow/compiler/xla/python/py_executable.cc index b2cd2af56ea..53891b96846 100644 --- a/tensorflow/compiler/xla/python/py_executable.cc +++ b/tensorflow/compiler/xla/python/py_executable.cc @@ -37,7 +37,9 @@ PyExecutable::PyExecutable(std::shared_ptr client, if (next_) { next_->prev_ = this; } + options_.untuple_result = true; if (fingerprint_) { + options_.launch_id = tensorflow::Fingerprint32(*fingerprint_); VLOG(1) << "Fingerprint for executable " << executable_->name() << ": " << *fingerprint_; } @@ -56,30 +58,42 @@ PyExecutable::~PyExecutable() { } } -std::vector> PyExecutable::LocalDevices() const { - std::vector> devices; +std::vector> PyExecutable::LocalDevices() const { + std::vector> devices; devices.reserve(executable_->local_devices().size()); - for (Device* device : executable_->local_devices()) { + for (PjRtDevice* device : 
executable_->local_devices()) { devices.push_back(WrapWithClient(client_, device)); } return devices; } +StatusOr>> PyExecutable::PjRtExecute( + absl::Span args) { + std::vector> output_buffers; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(output_buffers, executable_->Execute(args, options_)); + } + auto traceback = Traceback::Get(); + std::vector> outputs; + outputs.reserve(output_buffers.size()); + for (auto& buffer : output_buffers) { + outputs.push_back( + std::make_unique(client_, std::move(buffer), traceback)); + } + return outputs; +} + StatusOr>> PyExecutable::Execute( absl::Span args) { std::vector> output_buffers; { py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - if (fingerprint_) { - options.launch_id = tensorflow::Fingerprint32(*fingerprint_); - } std::vector arg_buffers(args.size()); absl::c_transform(args, arg_buffers.begin(), [](PyBuffer* buf) { return buf->buffer(); }); TF_ASSIGN_OR_RETURN(output_buffers, - executable_->Execute(arg_buffers, options)); + executable_->Execute(arg_buffers, options_)); } auto traceback = Traceback::Get(); std::vector> outputs; @@ -97,11 +111,6 @@ PyExecutable::ExecuteOnLocalDevices( std::vector>> output_buffers; { py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - if (fingerprint_) { - options.launch_id = tensorflow::Fingerprint32(*fingerprint_); - } std::vector> arg_buffers(args.size()); for (int computation = 0; computation < args.size(); ++computation) { arg_buffers[computation].resize(args[computation].size()); @@ -109,7 +118,7 @@ PyExecutable::ExecuteOnLocalDevices( [](PyBuffer* buf) { return buf->buffer(); }); } TF_ASSIGN_OR_RETURN(output_buffers, executable_->ExecuteOnLocalDevices( - arg_buffers, options)); + arg_buffers, options_)); } auto traceback = Traceback::Get(); std::vector>> outputs; diff --git a/tensorflow/compiler/xla/python/py_executable.h b/tensorflow/compiler/xla/python/py_executable.h index 1051d065335..2e51548ae51 100644 --- a/tensorflow/compiler/xla/python/py_executable.h +++ b/tensorflow/compiler/xla/python/py_executable.h @@ -47,7 +47,7 @@ class PyExecutable { return executable_->local_logical_device_ids(); } - std::vector> LocalDevices() const; + std::vector> LocalDevices() const; int64 SizeOfGeneratedCodeInBytes() const { return executable_->SizeOfGeneratedCodeInBytes(); @@ -58,6 +58,10 @@ class PyExecutable { StatusOr>> Execute( absl::Span args); + // Same as above, but take as inputs `PjRtBuffer*`. Only targets C++ code. + StatusOr>> PjRtExecute( + absl::Span args); + StatusOr>>> ExecuteOnLocalDevices(absl::Span> args); @@ -65,6 +69,8 @@ class PyExecutable { Traceback* traceback() { return traceback_.get(); } + const PjRtExecutable& pjrt_executable() const { return *executable_; } + private: friend class PyClient; @@ -77,6 +83,9 @@ class PyExecutable { // aren't implemented. absl::optional fingerprint_; + // The options to pass to `executable_.Execute`. + ExecuteOptions options_; + // Doubly-linked list of all executables known to the client. Protected by the // GIL. PyExecutable* next_; diff --git a/tensorflow/compiler/xla/python/pytree.cc b/tensorflow/compiler/xla/python/pytree.cc new file mode 100644 index 00000000000..bf0bb1a8d93 --- /dev/null +++ b/tensorflow/compiler/xla/python/pytree.cc @@ -0,0 +1,648 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Caution: this code uses exceptions. The exception use is local to the +// binding code and the idiomatic way to emit Python exceptions. + +#include "tensorflow/compiler/xla/python/pytree.h" + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/hash/hash.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "pybind11/stl.h" + +namespace xla { + +namespace py = pybind11; + +/*static*/ CustomNodeRegistry* CustomNodeRegistry::Singleton() { + static auto* registry = new CustomNodeRegistry; + return registry; +} + +/*static*/ void CustomNodeRegistry::Register(py::object type, + py::function to_iterable, + py::function from_iterable) { + CustomNodeRegistry* registry = Singleton(); + auto registration = absl::make_unique(); + registration->type = type; + registration->to_iterable = std::move(to_iterable); + registration->from_iterable = std::move(from_iterable); + auto it = registry->registrations_.emplace(type, std::move(registration)); + if (!it.second) { + throw std::invalid_argument( + absl::StrFormat("Duplicate custom PyTreeDef type registration for %s.", + py::repr(type))); + } +} + +/*static*/ const CustomNodeRegistry::Registration* CustomNodeRegistry::Lookup( + py::handle type) { + CustomNodeRegistry* registry = Singleton(); + auto it = + registry->registrations_.find(py::reinterpret_borrow(type)); + return it == registry->registrations_.end() ? nullptr : it->second.get(); +} + +bool PyTreeDef::operator==(const PyTreeDef& other) const { + if (traversal_.size() != other.traversal_.size()) { + return false; + } + for (size_t i = 0; i < traversal_.size(); ++i) { + const Node& a = traversal_[i]; + const Node& b = other.traversal_[i]; + if (a.kind != b.kind || a.arity != b.arity || + (a.node_data.ptr() == nullptr) != (b.node_data.ptr() == nullptr) || + a.custom != b.custom) { + return false; + } + if (a.node_data && a.node_data.not_equal(b.node_data)) { + return false; + } + // We don't need to test equality of num_leaves and num_nodes since they + // are derivable from the other node data. + } + return true; +} + +/*static*/ PyTreeDef::Kind PyTreeDef::GetKind( + const py::handle& obj, CustomNodeRegistry::Registration const** custom) { + const PyObject* ptr = obj.ptr(); + if (PyTuple_CheckExact(ptr)) return Kind::kTuple; + if (PyList_CheckExact(ptr)) return Kind::kList; + if (PyDict_CheckExact(ptr)) return Kind::kDict; + if ((*custom = CustomNodeRegistry::Lookup(obj.get_type()))) { + return Kind::kCustom; + } else if (py::isinstance(obj)) { + return Kind::kNone; + } else if (py::isinstance(obj) && py::hasattr(obj, "_fields")) { + // We can only identify namedtuples heuristically, here by the presence of + // a _fields attribute. 
+ return Kind::kNamedTuple; + } else { + return Kind::kLeaf; + } +} + +void PyTreeDef::FlattenInto(py::handle handle, + std::vector& leaves) { + Node node; + int start_num_nodes = traversal_.size(); + int start_num_leaves = leaves.size(); + node.kind = GetKind(handle, &node.custom); + if (node.kind == Kind::kNone) { + // Nothing to do. + } else if (node.kind == Kind::kTuple) { + py::tuple tuple = py::reinterpret_borrow(handle); + node.arity = tuple.size(); + for (py::handle entry : tuple) { + FlattenInto(entry, leaves); + } + } else if (node.kind == Kind::kList) { + py::list list = py::reinterpret_borrow(handle); + node.arity = list.size(); + for (py::handle entry : list) { + FlattenInto(entry, leaves); + } + } else if (node.kind == Kind::kDict) { + py::dict dict = py::reinterpret_borrow(handle); + py::list keys = py::reinterpret_steal(PyDict_Keys(dict.ptr())); + if (PyList_Sort(keys.ptr())) { + throw std::runtime_error("Dictionary key sort failed."); + } + for (py::handle key : keys) { + FlattenInto(dict[key], leaves); + } + node.arity = dict.size(); + node.node_data = std::move(keys); + } else if (node.kind == Kind::kCustom) { + py::tuple out = py::cast(node.custom->to_iterable(handle)); + if (out.size() != 2) { + throw std::runtime_error( + "PyTree custom to_iterable function should return a pair"); + } + node.node_data = out[1]; + node.arity = 0; + for (py::handle entry : py::cast(out[0])) { + ++node.arity; + FlattenInto(entry, leaves); + } + } else if (node.kind == Kind::kNamedTuple) { + py::tuple tuple = py::reinterpret_borrow(handle); + node.arity = tuple.size(); + node.node_data = py::reinterpret_borrow(tuple.get_type()); + for (py::handle entry : tuple) { + FlattenInto(entry, leaves); + } + } else { + assert(node.kind == Kind::kLeaf); + leaves.push_back(pybind11::reinterpret_borrow(handle)); + } + node.num_nodes = traversal_.size() - start_num_nodes + 1; + node.num_leaves = leaves.size() - start_num_leaves; + traversal_.push_back(std::move(node)); +} + +/*static*/ std::pair, std::unique_ptr> +PyTreeDef::Flatten(py::handle x) { + std::vector leaves; + auto tree = absl::make_unique(); + tree->FlattenInto(x, leaves); + return std::make_pair(std::move(leaves), std::move(tree)); +} + +/*static*/ bool PyTreeDef::AllLeaves(const py::iterable& x) { + const CustomNodeRegistry::Registration* custom; + for (const py::handle& h : x) { + if (GetKind(h, &custom) != Kind::kLeaf) return false; + } + return true; +} + +py::object PyTreeDef::Unflatten(py::iterable leaves) const { + std::vector agenda; + auto it = leaves.begin(); + int leaf_count = 0; + for (const Node& node : traversal_) { + if (agenda.size() < node.arity) { + throw std::logic_error("Too few elements for TreeDef node."); + } + switch (node.kind) { + case Kind::kLeaf: + if (it == leaves.end()) { + throw std::invalid_argument(absl::StrFormat( + "Too few leaves for PyTreeDef; expected %d, got %d", num_leaves(), + leaf_count)); + } + agenda.push_back(py::reinterpret_borrow(*it)); + ++it; + ++leaf_count; + break; + + case Kind::kNone: + case Kind::kTuple: + case Kind::kNamedTuple: + case Kind::kList: + case Kind::kDict: + case Kind::kCustom: { + const int size = agenda.size(); + absl::Span span; + if (node.arity > 0) { + span = absl::Span(&agenda[size - node.arity], node.arity); + } + py::object o = MakeNode(node, span); + agenda.resize(size - node.arity); + agenda.push_back(o); + break; + } + } + } + if (it != leaves.end()) { + throw std::invalid_argument(absl::StrFormat( + "Too many leaves for PyTreeDef; expected %d.", num_leaves())); 
+ } + if (agenda.size() != 1) { + throw std::logic_error("PyTreeDef traversal did not yield a singleton."); + } + return std::move(agenda.back()); +} + +/*static*/ py::object PyTreeDef::MakeNode(const PyTreeDef::Node& node, + absl::Span children) { + if (children.size() != node.arity) { + throw std::logic_error("Node arity mismatch."); + } + switch (node.kind) { + case Kind::kLeaf: + throw std::logic_error("MakeNode not implemented for leaves."); + + case Kind::kNone: + return py::none(); + + case Kind::kTuple: + case Kind::kNamedTuple: { + py::tuple tuple(node.arity); + for (int i = 0; i < node.arity; ++i) { + tuple[i] = std::move(children[i]); + } + if (node.kind == Kind::kNamedTuple) { + return node.node_data(*tuple); + } else { + return std::move(tuple); + } + } + + case Kind::kList: { + py::list list(node.arity); + for (int i = 0; i < node.arity; ++i) { + list[i] = std::move(children[i]); + } + return std::move(list); + } + + case Kind::kDict: { + py::dict dict; + py::list keys = py::reinterpret_borrow(node.node_data); + for (int i = 0; i < node.arity; ++i) { + dict[keys[i]] = std::move(children[i]); + } + return std::move(dict); + break; + } + case Kind::kCustom: { + py::tuple tuple(node.arity); + for (int i = 0; i < node.arity; ++i) { + tuple[i] = std::move(children[i]); + } + return node.custom->from_iterable(node.node_data, tuple); + } + } + throw std::logic_error("Unreachable code."); +} + +py::list PyTreeDef::FlattenUpTo(py::handle xs) const { + py::list leaves(num_leaves()); + std::vector agenda; + agenda.push_back(py::reinterpret_borrow(xs)); + auto it = traversal_.rbegin(); + int leaf = num_leaves() - 1; + while (!agenda.empty()) { + if (it == traversal_.rend()) { + throw std::invalid_argument(absl::StrFormat( + "Tree structures did not match: %s vs %s", py::repr(xs), ToString())); + } + const Node& node = *it; + py::object object = agenda.back(); + agenda.pop_back(); + ++it; + + switch (node.kind) { + case Kind::kLeaf: + if (leaf < 0) { + throw std::logic_error("Leaf count mismatch."); + } + leaves[leaf] = py::reinterpret_borrow(object); + --leaf; + break; + + case Kind::kNone: + break; + + case Kind::kTuple: { + if (!PyTuple_CheckExact(object.ptr())) { + throw std::invalid_argument( + absl::StrFormat("Expected tuple, got %s.", py::repr(object))); + } + py::tuple tuple = py::reinterpret_borrow(object); + if (tuple.size() != node.arity) { + throw std::invalid_argument( + absl::StrFormat("Tuple arity mismatch: %d != %d; tuple: %s.", + tuple.size(), node.arity, py::repr(object))); + } + for (py::handle entry : tuple) { + agenda.push_back(py::reinterpret_borrow(entry)); + } + break; + } + + case Kind::kList: { + if (!PyList_CheckExact(object.ptr())) { + throw std::invalid_argument( + absl::StrFormat("Expected list, got %s.", py::repr(object))); + } + py::list list = py::reinterpret_borrow(object); + if (list.size() != node.arity) { + throw std::invalid_argument( + absl::StrFormat("List arity mismatch: %d != %d; list: %s.", + list.size(), node.arity, py::repr(object))); + } + for (py::handle entry : list) { + agenda.push_back(py::reinterpret_borrow(entry)); + } + break; + } + + case Kind::kDict: { + if (!PyDict_CheckExact(object.ptr())) { + throw std::invalid_argument( + absl::StrFormat("Expected dict, got %s.", py::repr(object))); + } + py::dict dict = py::reinterpret_borrow(object); + py::list keys = + py::reinterpret_steal(PyDict_Keys(dict.ptr())); + if (PyList_Sort(keys.ptr())) { + throw std::runtime_error("Dictionary key sort failed."); + } + if 
(keys.not_equal(node.node_data)) { + throw std::invalid_argument( + absl::StrFormat("Dict key mismatch; expected keys: %s; dict: %s.", + py::repr(node.node_data), py::repr(object))); + } + for (py::handle key : keys) { + agenda.push_back(dict[key]); + } + break; + } + + case Kind::kNamedTuple: { + if (!py::isinstance(object) || + !py::hasattr(object, "_fields")) { + throw std::invalid_argument(absl::StrFormat( + "Expected named tuple, got %s.", py::repr(object))); + } + py::tuple tuple = py::reinterpret_borrow(object); + if (tuple.size() != node.arity) { + throw std::invalid_argument(absl::StrFormat( + "Named tuple arity mismatch: %d != %d; tuple: %s.", tuple.size(), + node.arity, py::repr(object))); + } + if (tuple.get_type().not_equal(node.node_data)) { + throw std::invalid_argument(absl::StrFormat( + "Named tuple type mismatch: expected type: %s, tuple: %s.", + py::repr(node.node_data), py::repr(object))); + } + for (py::handle entry : tuple) { + agenda.push_back(py::reinterpret_borrow(entry)); + } + break; + } + + case Kind::kCustom: { + auto* registration = CustomNodeRegistry::Lookup(object.get_type()); + if (registration != node.custom) { + throw std::invalid_argument(absl::StrFormat( + "Custom node type mismatch: expected type: %s, value: %s.", + py::repr(node.custom->type), py::repr(object))); + } + py::tuple out = py::cast(node.custom->to_iterable(object)); + if (out.size() != 2) { + throw std::runtime_error( + "PyTree custom to_iterable function should return a pair"); + } + if (node.node_data.not_equal(out[1])) { + throw std::invalid_argument(absl::StrFormat( + "Mismatch custom node data: %s != %s; value: %s.", + py::repr(node.node_data), py::repr(out[1]), py::repr(object))); + } + int arity = 0; + for (py::handle entry : py::cast(out[0])) { + ++arity; + agenda.push_back(py::reinterpret_borrow(entry)); + } + if (arity != node.arity) { + throw std::invalid_argument(absl::StrFormat( + "Custom type arity mismatch: %d != %d; value: %s.", arity, + node.arity, py::repr(object))); + } + break; + } + } + } + if (it != traversal_.rend() || leaf != -1) { + throw std::invalid_argument(absl::StrFormat( + "Tree structures did not match: %s vs %s", py::repr(xs), ToString())); + } + return leaves; +} + +py::object PyTreeDef::Walk(const py::function& f_node, py::handle f_leaf, + py::iterable leaves) const { + std::vector agenda; + auto it = leaves.begin(); + for (const Node& node : traversal_) { + switch (node.kind) { + case Kind::kLeaf: { + if (it == leaves.end()) { + throw std::invalid_argument("Too few leaves for PyTreeDef"); + } + + py::object leaf = py::reinterpret_borrow(*it); + agenda.push_back(f_leaf.is_none() ? 
std::move(leaf) + : f_leaf(std::move(leaf))); + ++it; + break; + } + + case Kind::kNone: + case Kind::kTuple: + case Kind::kNamedTuple: + case Kind::kList: + case Kind::kDict: + case Kind::kCustom: { + if (agenda.size() < node.arity) { + throw std::logic_error("Too few elements for custom type."); + } + py::tuple tuple(node.arity); + for (int i = node.arity - 1; i >= 0; --i) { + tuple[i] = agenda.back(); + agenda.pop_back(); + } + agenda.push_back(f_node(tuple)); + } + } + } + if (it != leaves.end()) { + throw std::invalid_argument("Too many leaves for PyTreeDef"); + } + if (agenda.size() != 1) { + throw std::logic_error("PyTreeDef traversal did not yield a singleton."); + } + return std::move(agenda.back()); +} + +py::object PyTreeDef::FromIterableTreeHelper( + py::handle xs, + std::vector::const_reverse_iterator* it) const { + if (*it == traversal_.rend()) { + throw std::invalid_argument("Tree structures did not match."); + } + const Node& node = **it; + ++*it; + if (node.kind == Kind::kLeaf) { + return py::reinterpret_borrow(xs); + } + py::iterable iterable = py::reinterpret_borrow(xs); + std::vector ys; + ys.reserve(node.arity); + for (py::handle x : iterable) { + ys.push_back(py::reinterpret_borrow(x)); + } + if (ys.size() != node.arity) { + throw std::invalid_argument("Arity mismatch between trees"); + } + for (int j = node.arity - 1; j >= 0; --j) { + ys[j] = FromIterableTreeHelper(ys[j], it); + } + + return MakeNode(node, absl::MakeSpan(ys)); +} + +py::object PyTreeDef::FromIterableTree(py::handle xs) const { + auto it = traversal_.rbegin(); + py::object out = FromIterableTreeHelper(xs, &it); + if (it != traversal_.rend()) { + throw std::invalid_argument("Tree structures did not match."); + } + return out; +} + +std::unique_ptr PyTreeDef::Compose(const PyTreeDef& inner) const { + auto out = absl::make_unique(); + for (const Node& n : traversal_) { + if (n.kind == Kind::kLeaf) { + absl::c_copy(inner.traversal_, std::back_inserter(out->traversal_)); + } else { + out->traversal_.push_back(n); + } + } + const auto& root = traversal_.back(); + const auto& inner_root = inner.traversal_.back(); + // TODO(tomhennigan): This should update all nodes in the traversal. 
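A quick worked check of the size bookkeeping applied just below, with invented numbers: composing an outer treedef of 5 nodes / 3 leaves (for example ((*,*),*)) with an inner treedef of 3 nodes / 2 leaves (for example (*,*)) replaces every outer leaf with a copy of the inner tree, so the composed root should report (5 - 3) + 3 * 3 = 11 nodes and 3 * 2 = 6 leaves:

```cpp
#include <iostream>

int main() {
  // Hypothetical sizes: outer treedef ((*,*),*) and inner treedef (*,*).
  const int outer_nodes = 5, outer_leaves = 3;
  const int inner_nodes = 3, inner_leaves = 2;

  // Interior nodes of the outer tree are kept; each outer leaf becomes a full
  // copy of the inner tree.
  const int composed_nodes =
      (outer_nodes - outer_leaves) + inner_nodes * outer_leaves;
  const int composed_leaves = outer_leaves * inner_leaves;

  std::cout << composed_nodes << " nodes, " << composed_leaves << " leaves\n";
  // Prints: 11 nodes, 6 leaves
}
```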
+ auto& out_root = out->traversal_.back(); + out_root.num_nodes = (root.num_nodes - root.num_leaves) + + (inner_root.num_nodes * root.num_leaves); + out_root.num_leaves *= inner_root.num_leaves; + return out; +} + +/*static*/ std::unique_ptr PyTreeDef::Tuple( + const std::vector& defs) { + auto out = absl::make_unique(); + for (const PyTreeDef& def : defs) { + absl::c_copy(def.traversal_, std::back_inserter(out->traversal_)); + } + Node node; + node.kind = Kind::kTuple; + node.arity = defs.size(); + out->traversal_.push_back(node); + return out; +} + +std::vector> PyTreeDef::Children() const { + std::vector> children; + if (traversal_.empty()) { + return children; + } + Node const& root = traversal_.back(); + children.resize(root.arity); + int pos = traversal_.size() - 1; + for (int i = root.arity - 1; i >= 0; --i) { + children[i] = absl::make_unique(); + const Node& node = traversal_.at(pos - 1); + if (pos < node.num_nodes) { + throw std::logic_error("children() walked off start of array"); + } + std::copy(traversal_.begin() + pos - node.num_nodes, + traversal_.begin() + pos, + std::back_inserter(children[i]->traversal_)); + pos -= node.num_nodes; + } + if (pos != 0) { + throw std::logic_error("pos != 0 at end of PyTreeDef::Children"); + } + return children; +} + +std::string PyTreeDef::ToString() const { + std::vector agenda; + for (const Node& node : traversal_) { + if (agenda.size() < node.arity) { + throw std::logic_error("Too few elements for container."); + } + + std::string kind; + switch (node.kind) { + case Kind::kLeaf: + agenda.push_back("*"); + continue; + case Kind::kNone: + kind = "None"; + break; + case Kind::kNamedTuple: + kind = "namedtuple"; + break; + case Kind::kTuple: + kind = "tuple"; + break; + case Kind::kList: + kind = "list"; + break; + case Kind::kDict: + kind = "dict"; + break; + case Kind::kCustom: + kind = static_cast(py::str(node.custom->type)); + break; + } + + std::string children = + absl::StrJoin(agenda.end() - node.arity, agenda.end(), ","); + agenda.erase(agenda.end() - node.arity, agenda.end()); + + std::string data; + if (node.node_data) { + data = absl::StrFormat("[%s]", py::str(node.node_data)); + } + + agenda.push_back( + absl::StrFormat("PyTreeDef(%s%s, [%s])", kind, data, children)); + } + + if (agenda.size() != 1) { + throw std::logic_error("PyTreeDef traversal did not yield a singleton."); + } + return std::move(agenda.back()); +} + +void BuildPytreeSubmodule(py::module& m) { + py::module pytree = m.def_submodule("pytree", "Python tree library"); + pytree.def("flatten", &PyTreeDef::Flatten); + pytree.def("tuple", &PyTreeDef::Tuple); + pytree.def("all_leaves", &PyTreeDef::AllLeaves); + + py::class_(m, "PyTreeDef") + .def("unflatten", &PyTreeDef::Unflatten) + .def("flatten_up_to", &PyTreeDef::FlattenUpTo) + .def("compose", &PyTreeDef::Compose) + .def("walk", &PyTreeDef::Walk) + .def("from_iterable_tree", &PyTreeDef::FromIterableTree) + .def("children", &PyTreeDef::Children) + .def_property_readonly("num_leaves", &PyTreeDef::num_leaves) + .def_property_readonly("num_nodes", &PyTreeDef::num_nodes) + .def("__repr__", &PyTreeDef::ToString) + .def("__eq__", + [](const PyTreeDef& a, const PyTreeDef& b) { return a == b; }) + .def("__ne__", + [](const PyTreeDef& a, const PyTreeDef& b) { return a != b; }) + .def("__hash__", + [](const PyTreeDef& t) { return absl::Hash()(t); }); + + pytree.def("register_node", [](py::object type, py::function to_iterable, + py::function from_iterable) { + return CustomNodeRegistry::Register(type, to_iterable, 
from_iterable); + }); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/pytree.h b/tensorflow/compiler/xla/python/pytree.h new file mode 100644 index 00000000000..69cd93a7d08 --- /dev/null +++ b/tensorflow/compiler/xla/python/pytree.h @@ -0,0 +1,214 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_PYTREE_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_PYTREE_H_ + +// See https://jax.readthedocs.io/en/latest/pytrees.html for the documentation +// about pytree. + +// Caution: this code uses exceptions. The exception use is local to the +// binding code and the idiomatic way to emit Python exceptions. + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/hash/hash.h" +#include "absl/memory/memory.h" +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "pybind11/stl.h" + +namespace xla { + +// Registry of custom node types. +class CustomNodeRegistry { + public: + struct Registration { + // The Python type object, used to identify the type. + pybind11::object type; + // A function with signature: object -> (iterable, aux_data) + pybind11::function to_iterable; + // A function with signature: (aux_data, iterable) -> object + pybind11::function from_iterable; + }; + + // Registers a new custom type. Objects of `type` will be treated as container + // node types in PyTrees. + static void Register(pybind11::object type, pybind11::function to_iterable, + pybind11::function from_iterable); + + // Finds the custom type registration for `type`. Returns nullptr if none + // exists. + static const Registration* Lookup(pybind11::handle type); + + private: + static CustomNodeRegistry* Singleton(); + + struct TypeHash { + size_t operator()(const pybind11::object& t) const { + return pybind11::hash(t); + } + }; + struct TypeEq { + bool operator()(const pybind11::object& a, + const pybind11::object& b) const { + return a.equal(b); + } + }; + absl::flat_hash_map, TypeHash, + TypeEq> + registrations_; +}; + +// A PyTreeDef describes the tree structure of a PyTree. A PyTree is a tree of +// Python values, where the interior nodes are tuples, lists, dictionaries, or +// user-defined containers, and the leaves are other objects. +class PyTreeDef { + public: + PyTreeDef() = default; + + // Flattens a Pytree into a list of leaves and a PyTreeDef. + static std::pair, std::unique_ptr> + Flatten(pybind11::handle x); + + // Recursive helper used to implement Flatten(). + void FlattenInto(pybind11::handle handle, + std::vector& leaves); + + // Tests whether the given list is a flat list of leaves. + static bool AllLeaves(const pybind11::iterable& x); + + // Flattens a Pytree up to this PyTreeDef. 'this' must be a tree prefix of + // the tree-structure of 'x'. 
For example, if we flatten a value + // [(1, (2, 3)), {"foo": 4}] with a treedef [(*, *), *], the result is the + // list of leaves [1, (2, 3), {"foo": 4}]. + pybind11::list FlattenUpTo(pybind11::handle x) const; + + // Returns an unflattened PyTree given an iterable of leaves and a PyTreeDef. + pybind11::object Unflatten(pybind11::iterable leaves) const; + + // Composes two PyTreeDefs, replacing the leaves of this tree with copies of + // `inner`. + std::unique_ptr Compose(const PyTreeDef& inner) const; + + // Makes a Tuple PyTreeDef out of a vector of PyTreeDefs. + static std::unique_ptr Tuple(const std::vector& defs); + + std::vector> Children() const; + + // Maps a function over a PyTree structure, applying f_leaf to each leaf, and + // f_node to each container node. + // TODO(phawkins): use flattening everywhere instead and delete this method. + pybind11::object Walk(const pybind11::function& f_node, + pybind11::handle f_leaf, + pybind11::iterable leaves) const; + + // Given a tree of iterables with the same node/leaf structure as this PyTree, + // build the corresponding PyTree. + // TODO(phawkins): use flattening everywhere instead and delete this method. + pybind11::object FromIterableTree(pybind11::handle xs) const; + + int num_leaves() const { + if (traversal_.empty()) { + return 0; + } + return traversal_.back().num_leaves; + } + + int num_nodes() const { return traversal_.size(); } + + size_t Hash() const; + + bool operator==(const PyTreeDef& other) const; + bool operator!=(const PyTreeDef& other) const { return !(*this == other); } + + std::string ToString() const; + + private: + enum class Kind { + kLeaf, // An opaque leaf node + kNone, // None. + kTuple, // A tuple + kNamedTuple, // A collections.namedtuple + kList, // A list + kDict, // A dict + kCustom, // A custom type. + }; + + struct Node { + Kind kind = Kind::kLeaf; + + // Arity for non-kLeaf types. + int arity = 0; + + // Kind-specific auxiliary data. For a kNamedTuple, contains the tuple type + // object. For a kDict, contains a sorted list of keys. For a kCustom type, + // contains the auxiliary data returned by the `to_iterable` function. + pybind11::object node_data; + + const CustomNodeRegistry::Registration* custom = nullptr; + + // Number of leaf nodes in the subtree rooted at this node. + int num_leaves = 0; + + // Number of leaf and interior nodes in the subtree rooted at this node. + int num_nodes = 0; + }; + template + friend H AbslHashValue(H h, const Node& n); + + template + friend H AbslHashValue(H h, const PyTreeDef& t); + + // Helper that manufactures an instance of a node given its children. + static pybind11::object MakeNode(const Node& node, + absl::Span children); + + // Recursive helper used to implement FromIterableTree() + pybind11::object FromIterableTreeHelper( + pybind11::handle xs, + std::vector::const_reverse_iterator* it) const; + + // Computes the node kind of a given Python object. + static Kind GetKind(const pybind11::handle& obj, + CustomNodeRegistry::Registration const** custom); + + // Nodes, in a post-order traversal. We use an ordered traversal to minimize + // allocations, and post-order corresponds to the order we need to rebuild the + // tree structure. 
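The comment above states the central invariant of this file: nodes are stored children-first (post-order), so rebuilding a tree is a single left-to-right pass over traversal_ with a stack of partially built values (the agenda in Unflatten). A toy standalone version of that scheme, with strings standing in for Python objects; ToyNode and Rebuild are illustrative names written for this note, not part of the patch:

```cpp
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// A toy version of the post-order `traversal_` encoding: each node records
// only its kind and arity, and children precede their parent.
struct ToyNode {
  std::string kind;  // "leaf" or "tuple"
  int arity = 0;
};

// Rebuilds a bracketed string from leaves plus a post-order traversal, using
// the same stack ("agenda") scheme as PyTreeDef::Unflatten.
std::string Rebuild(const std::vector<ToyNode>& traversal,
                    const std::vector<std::string>& leaves) {
  std::vector<std::string> agenda;
  size_t next_leaf = 0;
  for (const ToyNode& node : traversal) {
    if (node.kind == "leaf") {
      agenda.push_back(leaves[next_leaf++]);
      continue;
    }
    assert(agenda.size() >= static_cast<size_t>(node.arity));
    std::string joined;
    for (size_t i = agenda.size() - node.arity; i < agenda.size(); ++i) {
      if (!joined.empty()) joined += ", ";
      joined += agenda[i];
    }
    agenda.resize(agenda.size() - node.arity);
    agenda.push_back("(" + joined + ")");
  }
  assert(agenda.size() == 1);
  return agenda.back();
}

int main() {
  // Encodes the structure ((a, b), c): two leaves, the inner pair, another
  // leaf, then the root pair; children always appear before their parent.
  std::vector<ToyNode> traversal = {
      {"leaf"}, {"leaf"}, {"tuple", 2}, {"leaf"}, {"tuple", 2}};
  std::cout << Rebuild(traversal, {"a", "b", "c"}) << "\n";  // ((a, b), c)
}
```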
+ std::vector traversal_; +}; + +template +H AbslHashValue(H h, const PyTreeDef::Node& n) { + h = H::combine(std::move(h), n.kind, n.arity, n.custom); + return h; +} + +template +H AbslHashValue(H h, const PyTreeDef& t) { + return H::combine_contiguous(std::move(h), t.traversal_.data(), + t.traversal_.size()); +} + +void BuildPytreeSubmodule(pybind11::module& m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_PYTREE_H_ diff --git a/tensorflow/compiler/xla/python/tpu_driver/BUILD b/tensorflow/compiler/xla/python/tpu_driver/BUILD index 4725becdedf..70aeb3f2a86 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/BUILD @@ -115,6 +115,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "pod_tpu_driver", + srcs = ["pod_tpu_driver.cc"], + deps = [ + ":grpc_tpu_driver", + ":tpu_driver", + ":tpu_driver_proto_cc", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/compiler/xla/pjrt:semaphore", + "//tensorflow/compiler/xla/pjrt:worker_thread", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + tf_grpc_cc_dependency(), + ] + external_deps(), + alwayslink = 1, +) + go_proto_library( name = "tpu_service_go_proto", compatible_with = ["//buildenv/target:gce"], diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD index c460cc36f08..30a220ece45 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow/compiler/xla/python/tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:direct_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:grpc_tpu_driver", + "//tensorflow/compiler/xla/python/tpu_driver:pod_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:recording_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:tpu_driver_proto_cc", "//tensorflow/compiler/xla/service:computation_placer", diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc index e78f04ff980..0602d096aaa 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc @@ -37,8 +37,8 @@ namespace xla { TpuDevice::TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip) - : xla::Device(id, /*local_device_state=*/nullptr, kTpuPlatform, - /*device_kind=*/"Cloud TPU", host_id), + : xla::PjRtDevice(id, /*local_device_state=*/nullptr, kTpuPlatform, + /*device_kind=*/"Cloud TPU", host_id), coords_(coords), core_on_chip_(core_on_chip) {} @@ -47,9 +47,9 @@ std::string TpuDevice::DebugString() const { coords_[0], coords_[1], coords_[2], core_on_chip_); } -xla::StatusOr>> +xla::StatusOr>> TpuDevice::GetTpuDevices(const tpu_driver::SystemInfo& system_info) { - std::vector> devices; + std::vector> devices; for (const auto& chip : system_info.tpu_chip()) { auto& coord = chip.chip_coord(); std::array coords_array = {coord.x(), coord.y(), coord.z()}; @@ -78,7 +78,7 @@ StatusOr> PyTpuClient::Get( tpu_driver::SystemInfo system_info; client->QuerySystemInfo(&system_info); - TF_ASSIGN_OR_RETURN(std::vector> devices, + TF_ASSIGN_OR_RETURN(std::vector> devices, TpuDevice::GetTpuDevices(system_info)); return std::make_shared(kTpuPlatform, std::move(client), @@ -88,13 +88,13 @@ StatusOr> PyTpuClient::Get( 
PyTpuClient::PyTpuClient(std::string platform_name, std::unique_ptr driver, - std::vector> devices, + std::vector> devices, int host_id) : platform_name_(std::move(platform_name)), driver_(std::move(driver)), devices_(std::move(devices)), host_id_(host_id) { - for (const std::shared_ptr& device : devices_) { + for (const std::shared_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device}).second) << "Duplicate device id: " << device->id(); @@ -173,7 +173,7 @@ static Status CheckDataType(xla::PrimitiveType dtype) { StatusOr> PyTpuBuffer::FromLiterals( std::vector leaves, const Shape& tuple_shape, std::shared_ptr leaves_references, - std::shared_ptr client, std::shared_ptr device) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::FromLiterals"); VLOG(1) << "PyTpuBuffer::FromLiterals: shape: " << tuple_shape.DebugString() << " device: " << device->DebugString(); @@ -229,7 +229,7 @@ StatusOr> PyTpuBuffer::FromLiterals( /* static */ StatusOr> PyTpuBuffer::MakeTuple( absl::Span buffers, std::shared_ptr client, - std::shared_ptr device) { + std::shared_ptr device) { std::vector child_shapes; std::vector> child_device_buffers; std::vector child_handle_ptrs; @@ -388,7 +388,7 @@ PyTpuBuffer::DestructureTuple() { } StatusOr> PyTpuBuffer::CopyToDevice( - std::shared_ptr dst_device) { + std::shared_ptr dst_device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CopyToDevice"); if (on_host_shape_.IsTuple()) { return Unimplemented("CopyToDevice for tuples is not supported."); @@ -433,7 +433,7 @@ Status PyTpuBuffer::BlockHostUntilReady() { /* static */ StatusOr> PyTpuBuffer::AllocateBuffer( const Shape& shape, std::shared_ptr client, - std::shared_ptr device) { + std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::AllocateBuffer"); VLOG(1) << "PyTpuBuffer::AllocateBuffer: shape: " << shape.DebugString() << " device: " << device->DebugString(); @@ -465,7 +465,7 @@ StatusOr> PyTpuBuffer::AllocateBuffer( /*static*/ StatusOr> PyTpuBuffer::CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, std::shared_ptr device) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CreateBuffer"); VLOG(1) << "PyTpuBuffer::CreateBuffer: shape: " << non_tuple_shape.DebugString() @@ -493,8 +493,8 @@ StatusOr> PyTpuBuffer::CreateBuffer( std::vector>(), client); } -static std::shared_ptr LookupDevice(const PyTpuClient& client, - int device_id) { +static std::shared_ptr LookupDevice(const PyTpuClient& client, + int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -516,7 +516,7 @@ PyTpuExecutable::PyTpuExecutable( for (int replica = 0; replica < num_replicas; ++replica) { for (int partition = 0; partition < num_partitions; ++partition) { int device_id = device_assignment_(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + std::shared_ptr device = LookupDevice(*client_, device_id); if (device->host_id() != client_->host_id()) { VLOG(3) << "Non-local device: " << device_id; continue; @@ -541,7 +541,7 @@ PyTpuExecutable::ExecuteResult PyTpuExecutable::ExecuteHelper( absl::Span this_core_arguments, int replica, int partition, const RunId& run_id) { const int device_id = device_assignment_(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + std::shared_ptr device = 
LookupDevice(*client_, device_id); CHECK_EQ(device->host_id(), client_->host_id()); tensorflow::profiler::TraceMe traceme("PyTpuExecutable::Execute"); VLOG(3) << "Replica " << replica << ", partition " << partition @@ -588,7 +588,7 @@ PyTpuExecutable::ExecuteResult PyTpuExecutable::ExecuteHelper( static const absl::Duration kWarnExecutionDelay = absl::Seconds(10); // Delay before terminating a stalled execute call. -static const absl::Duration kMaxExecutionDelay = absl::Seconds(120); +static const absl::Duration kMaxExecutionDelay = absl::Minutes(60); Status WaitForExecuteEvent(tpu_driver::Event* event) { absl::optional opt_status; diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h index 4c45df181db..c2a424677fd 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h @@ -38,7 +38,7 @@ namespace xla { constexpr char kTpuPlatform[] = "tpu"; -class TpuDevice : public Device { +class TpuDevice : public PjRtDevice { public: TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip); @@ -48,8 +48,8 @@ class TpuDevice : public Device { std::string DebugString() const override; - static xla::StatusOr>> GetTpuDevices( - const tpu_driver::SystemInfo& system_info); + static xla::StatusOr>> + GetTpuDevices(const tpu_driver::SystemInfo& system_info); private: const std::array coords_; @@ -66,7 +66,7 @@ class PyTpuClient { explicit PyTpuClient(std::string platform_name, std::unique_ptr driver, - std::vector> devices, + std::vector> devices, int host_id); virtual ~PyTpuClient() = default; @@ -83,11 +83,11 @@ class PyTpuClient { int device_count() const { return devices_.size(); } int local_device_count() const { return local_devices_.size(); } - const std::vector>& devices() { return devices_; } - const std::vector>& local_devices() { + const std::vector>& devices() { return devices_; } + const std::vector>& local_devices() { return local_devices_; } - const std::map>& id_to_device() const { + const std::map>& id_to_device() const { return id_to_device_; } int host_id() const { return host_id_; } @@ -110,11 +110,11 @@ class PyTpuClient { std::unique_ptr driver_; // Includes all devices, including non-local devices on multi-host platforms. - std::vector> devices_; + std::vector> devices_; // Maps Device::id() to the corresponding Device. Includes all devices. - std::map> id_to_device_; + std::map> id_to_device_; // Local devices indexed by local device ordinal. - std::vector> local_devices_; + std::vector> local_devices_; int host_id_; // A thread pool for scheduling core executions in parallel. @@ -128,7 +128,7 @@ struct TpuSharedBuffer final { TpuSharedBuffer(tpu_driver::TpuDriver* driver, std::unique_ptr handle, std::vector> wait_for_use, - std::shared_ptr src_device) + std::shared_ptr src_device) : driver(driver), device(std::move(src_device)), handle(std::move(handle)), @@ -143,7 +143,7 @@ struct TpuSharedBuffer final { } tpu_driver::TpuDriver* const driver; - const std::shared_ptr device; + const std::shared_ptr device; std::unique_ptr handle; std::vector> wait_for_use; @@ -162,12 +162,12 @@ class PyTpuBuffer { static StatusOr> FromLiterals( std::vector leaves_literals, const Shape& tuple_shape, std::shared_ptr leaves_reference, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); // Supports nested tuple creation. 
static StatusOr> MakeTuple( absl::Span buffers, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); PyTpuBuffer() = delete; PyTpuBuffer(Shape on_host_shape, @@ -181,7 +181,7 @@ class PyTpuBuffer { PyTpuBuffer& operator=(PyTpuBuffer&&) = delete; const Shape& on_host_shape() const { return on_host_shape_; } - std::shared_ptr device() const { return device_; } + std::shared_ptr device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } std::shared_ptr client() const { return client_; } @@ -210,7 +210,7 @@ class PyTpuBuffer { // Copies the buffer to target device `dst_device` and returns a PyTpuBuffer // object holding the context to the target device buffer. StatusOr> CopyToDevice( - std::shared_ptr dst_device); + std::shared_ptr dst_device); // Blocks the host until the buffer's value has been computed and is ready for // immediate use on the device. Useful in particular for timing benchmarks. @@ -220,7 +220,7 @@ class PyTpuBuffer { // tuple, the returned buffer corresponds to the root tuple buffer. static StatusOr> AllocateBuffer( const Shape& shape, std::shared_ptr client, - std::shared_ptr device); + std::shared_ptr device); private: // Initializes a just allocated device buffer. The returned event will be @@ -231,11 +231,11 @@ class PyTpuBuffer { static StatusOr> CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); const std::shared_ptr client_; const Shape on_host_shape_; - const std::shared_ptr device_; + const std::shared_ptr device_; // If this is a tuple, `device_buffer_` stores the tuple buffer and // `child_buffers_` stores the child buffers; else, `device_buffer_` stores @@ -302,7 +302,7 @@ class PyTpuExecutable { return local_logical_device_ids_; } - const std::vector>& local_devices() const { + const std::vector>& local_devices() const { return local_devices_; } @@ -350,7 +350,7 @@ class PyTpuExecutable { // assigned. // shared_ptrs instead of unique_ptrs to play well with the Python bindings // (see xla.cc). 
- std::vector> local_devices_; + std::vector> local_devices_; xla::Shape result_shape_; }; diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index 9a794b79c5c..5d526b51899 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -40,11 +40,12 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def("host_id", &PyTpuClient::host_id) .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas, int num_partitions) - -> StatusOr>>> { + -> StatusOr< + std::vector>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, num_partitions)); - std::vector>> result; + std::vector>> result; result.resize(num_replicas); for (int r = 0; r < num_replicas; ++r) { result[r].resize(num_partitions); @@ -60,11 +61,11 @@ PYBIND11_MODULE(tpu_client_extension, m) { // TODO(skye): delete after all callers can handle 2D output .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas) - -> StatusOr>> { + -> StatusOr>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, /*num_partitions=*/1)); - std::vector> result; + std::vector> result; for (int i = 0; i < num_replicas; ++i) { int device_id = device_assignment(i, 0); auto iter = client->id_to_device().find(device_id); @@ -96,7 +97,8 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def( "buffer_from_pyval", [](std::shared_ptr client, - const pybind11::object& argument, std::shared_ptr device, + const pybind11::object& argument, + std::shared_ptr device, bool force_copy) -> StatusOr> { if (device == nullptr) { TF_RET_CHECK(!client->local_devices().empty()); @@ -145,7 +147,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { py::class_(m, "PyTpuBuffer") .def_property_readonly("client", &PyTpuBuffer::client) .def("copy_to_device", - [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { + [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { CHECK(dst_device != nullptr); GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; @@ -202,7 +204,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def_property_readonly("traceback", [](PyTpuExecutable*) { return py::none(); }); - py::class_>(m, "TpuDevice") + py::class_>(m, "TpuDevice") .def_property_readonly("coords", &TpuDevice::coords) .def_property_readonly("core_on_chip", &TpuDevice::core_on_chip) .def("__repr__", [](const TpuDevice& device) { diff --git a/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc b/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc new file mode 100644 index 00000000000..ac54df39895 --- /dev/null +++ b/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc @@ -0,0 +1,806 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= + +#include "absl/container/btree_map.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/worker_thread.h" +#include "tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.h" +#include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h" +#include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace tpu_driver { +namespace { + +using xla::Status; +using xla::WorkerThread; + +const char kPodTpuDriverPrefix[] = "grpc+pod://"; + +class PodTpuDriver; + +class PodEvent : public Event { + public: + explicit PodEvent(PodTpuDriver* driver, int64_t operation_id) + : driver_(driver), operation_id_(operation_id) {} + int64_t operation_id() const { return operation_id_; } + + xla::Status Await() override; + + absl::optional AwaitWithTimeout( + absl::Duration duration) override; + + void AddCallback(std::function callback) override; + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; +}; + +class CombinedEvent : public PodEvent { + public: + explicit CombinedEvent(PodTpuDriver* driver, int64_t operation_id, + std::vector> events) + : PodEvent(driver, operation_id), events_(events) {} + + xla::Status Await() override { + for (auto& event : events_) { + TF_RETURN_IF_ERROR(event->Await()); + } + return Status::OK(); + } + + absl::optional AwaitWithTimeout( + absl::Duration duration) override { + // TODO(frankchn): This might extend the timeout. + for (auto& event : events_) { + auto status = event->AwaitWithTimeout(duration); + if (status == absl::nullopt) { + return absl::nullopt; + } else { + TF_RETURN_IF_ERROR(status.value()); + } + } + return Status::OK(); + } + + void AddCallback(std::function callback) override { + // TODO(frankchn): This may return before every event is done. 
+ events_[0]->AddCallback(std::move(callback)); + } + + private: + std::vector> events_; +}; + +class PodBufferHandle : public BufferHandle { + public: + explicit PodBufferHandle(PodTpuDriver* driver, int64_t operation_id, + int64_t size_in_bytes, + absl::optional shape, + int64_t core_id) + : driver_(driver), + operation_id_(operation_id), + size_in_bytes_(size_in_bytes), + shape_(shape), + event_(std::make_shared(driver_, operation_id_)), + core_id_(core_id) {} + + std::shared_ptr OnReady() override { return event_; } + int64_t size_in_bytes() override { return size_in_bytes_; } + absl::optional shape() override { return shape_; } + + int64_t operation_id() const { return operation_id_; } + int64_t core_id() const { return core_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + const int64_t size_in_bytes_; + const absl::optional shape_; + std::shared_ptr event_; + const int64_t core_id_; +}; + +class PodCompiledProgramHandle : public CompiledProgramHandle { + public: + explicit PodCompiledProgramHandle(PodTpuDriver* driver, int64_t operation_id) + : driver_(driver), + operation_id_(operation_id), + event_(std::make_shared(driver_, operation_id_)) {} + + std::shared_ptr OnReady() override { return event_; } + + xla::Status program_shape(xla::ProgramShapeProto* program_shape) override; + + int64_t operation_id() const { return operation_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + std::shared_ptr event_; +}; + +class PodLoadedProgramHandle : public LoadedProgramHandle { + public: + explicit PodLoadedProgramHandle(PodTpuDriver* driver, int64_t operation_id, + int64_t core_id) + : driver_(driver), + operation_id_(operation_id), + core_id_(core_id), + event_(std::make_shared(driver_, operation_id_)) {} + + std::shared_ptr OnReady() override { return event_; } + + int64_t operation_id() const { return operation_id_; } + int64_t core_id() const { return core_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + const int64_t core_id_; + std::shared_ptr event_; +}; + +struct EventInFlight { + std::shared_ptr underlying_event; + std::function(void)> create_fn; + + absl::flat_hash_set incomplete_deps; + std::vector> callbacks; +}; + +class PodTpuDriver : public TpuDriver { + public: + explicit PodTpuDriver(const TpuDriverConfig& config, + std::shared_ptr<::grpc::ChannelCredentials> creds) + : config_(config), + creds_(creds), + event_thread_(tensorflow::Env::Default(), "grpc_pod_event_thread") { + std::vector workers = absl::StrSplit( + absl::StripPrefix(config.worker(), kPodTpuDriverPrefix), ','); + for (const auto& worker : workers) { + TpuDriverConfig worker_config(config_); + *(worker_config.mutable_worker()) = absl::StrCat("grpc://", worker); + drivers_.push_back( + CreateGrpcTpuDriver(worker_config, creds_).ConsumeValueOrDie()); + } + + int cumulative_core_id = 0; + absl::flat_hash_set> processed_chips; + + for (int driver_num = 0; driver_num < workers.size(); ++driver_num) { + SystemInfo driver_info; + drivers_[driver_num]->QuerySystemInfo(&driver_info); + + for (const auto& tpu_chip : driver_info.tpu_chip()) { + std::tuple coord{tpu_chip.chip_coord().x(), + tpu_chip.chip_coord().y(), + tpu_chip.chip_coord().z()}; + // We only want to add chips that we have not seen before if we are in a + // TPU pod slice, or we are only seeing local cores (e.g. we are + // connected to individual TPUs or we are in a test environment). 
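[Editorial aside, not part of the patch] The constructor above turns a single pod address into one gRPC driver per worker host. The standalone sketch below uses a made-up two-host address and mirrors only the absl::StrSplit/StripPrefix/StrCat handling in PodTpuDriver's constructor.

#include <string>
#include <vector>
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"

// Hypothetical illustration only.
std::vector<std::string> SplitPodWorkers(const std::string& worker_address) {
  // e.g. worker_address = "grpc+pod://10.0.0.1:8470,10.0.0.2:8470" (made up).
  std::vector<std::string> hosts =
      absl::StrSplit(absl::StripPrefix(worker_address, "grpc+pod://"), ',');
  for (std::string& host : hosts) {
    // Each entry is then dialed as an ordinary gRPC TPU driver.
    host = absl::StrCat("grpc://", host);
  }
  return hosts;  // {"grpc://10.0.0.1:8470", "grpc://10.0.0.2:8470"}
}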
+ if (!processed_chips.contains(coord) || + driver_info.core_count() == driver_info.local_core_size()) { + *(pod_info_.add_tpu_chip()) = tpu_chip; + processed_chips.insert(coord); + } + } + + *(pod_info_.mutable_cpu()) = driver_info.cpu(); + } + + // Process all the unique chips that we have seen. + for (auto& tpu_chip : *pod_info_.mutable_tpu_chip()) { + for (auto& tpu_core : *tpu_chip.mutable_core()) { + int current_core = cumulative_core_id++; + + core_to_driver_.push_back(drivers_[tpu_chip.host_id()].get()); + core_to_driver_id_.push_back(tpu_chip.host_id()); + core_to_driver_core_.push_back(tpu_core.id()); + + tpu_core.set_id(current_core); + tpu_core.set_core_on_host_index(current_core); + *(pod_info_.add_local_core()) = tpu_core; + } + + // We are setting host_id to zero because we want this to look like one + // host with many cores from the perspective of tpu_client.cc. + tpu_chip.set_host_id(0); + } + + pod_info_.set_chip_count(pod_info_.tpu_chip_size()); + pod_info_.set_core_count(pod_info_.local_core_size()); + + // We want this to look like one host with many TPU chips/cores connected. + pod_info_.set_host_count(1); + pod_info_.set_host_id(0); + } + + ~PodTpuDriver() override { + // TODO(frankchn): Unload all handles, and wait for all events to finish. + } + + void QuerySystemInfo(SystemInfo* system_info) override { + *system_info = pod_info_; + } + + xla::Status Reset() override { + for (auto& driver : drivers_) { + TF_RETURN_IF_ERROR(driver->Reset()); + } + return xla::Status::OK(); + } + + std::unique_ptr Allocate( + int32_t core_id, MemoryRegion region, int64_t num_bytes, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + operation_id, + [this, core_id, region, num_bytes, operation_id]() { + absl::MutexLock l(&mu_); + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->Allocate(core_to_driver_core_[core_id], + region, num_bytes, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, num_bytes, + absl::nullopt, core_id); + } + + std::unique_ptr Allocate( + int32_t core_id, MemoryRegion region, const xla::ShapeProto& shape, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + operation_id, + [this, core_id, region, shape, operation_id]() { + absl::MutexLock l(&mu_); + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->Allocate(core_to_driver_core_[core_id], + region, shape, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique( + this, operation_id, ComputeBytesFromShape(shape), shape, core_id); + } + + std::unique_ptr AllocateTuple( + int32_t core_id, MemoryRegion region, + absl::Span children, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + std::vector children_ids; + for (int i = 0; i < children.size(); ++i) { + auto child_op_id = + static_cast(children[i])->operation_id(); + deps.insert(child_op_id); + children_ids.push_back(child_op_id); + } + + ScheduleRequest( + operation_id, + [this, core_id, region, children_ids, operation_id]() { + absl::MutexLock l(&mu_); + + std::vector child_buffers; + child_buffers.reserve(children_ids.size()); + for (int i = 0; i < children_ids.size(); ++i) { + 
child_buffers.push_back(underlying_buffers_[children_ids[i]].get()); + } + + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->AllocateTuple( + core_to_driver_core_[core_id], region, child_buffers, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, 0, + absl::nullopt, core_id); + } + + std::shared_ptr Deallocate( + std::unique_ptr handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(handle.get())->operation_id()); + + auto op_id = static_cast(handle.get())->operation_id(); + auto core_id = static_cast(handle.get())->core_id(); + + ScheduleRequest( + operation_id, + [this, op_id, core_id]() { + absl::MutexLock l(&mu_); + auto buf_iter = underlying_buffers_.find(op_id); + auto underlying_hn = std::move(buf_iter->second); + underlying_buffers_.erase(buf_iter); + + return core_to_driver_[core_id]->Deallocate(std::move(underlying_hn), + {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr TransferToDevice( + const void* src, BufferHandle* dst, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(dst)->operation_id()); + + auto op_id = static_cast(dst)->operation_id(); + auto core_id = static_cast(dst)->core_id(); + + ScheduleRequest( + operation_id, + [this, src, op_id, core_id]() { + absl::MutexLock l(&mu_); + auto buf_iter = underlying_buffers_.find(op_id); + return core_to_driver_[core_id]->TransferToDevice( + src, buf_iter->second.get(), {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr TransferFromDevice( + const BufferHandle* src, void* dst, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(src)->operation_id()); + + auto op_id = static_cast(src)->operation_id(); + auto core_id = static_cast(src)->core_id(); + + ScheduleRequest( + operation_id, + [this, dst, op_id, core_id]() { + absl::MutexLock l(&mu_); + auto buf_iter = underlying_buffers_.find(op_id); + return core_to_driver_[core_id]->TransferFromDevice( + buf_iter->second.get(), dst, {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr TransferFromDeviceToDevice( + const BufferHandle* src, BufferHandle* dst, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(src)->operation_id()); + deps.insert(static_cast(dst)->operation_id()); + + auto src_op_id = static_cast(src)->operation_id(); + auto dst_op_id = static_cast(dst)->operation_id(); + auto core_id = static_cast(dst)->core_id(); + + ScheduleRequest( + operation_id, + [this, src_op_id, dst_op_id, core_id]() { + absl::MutexLock l(&mu_); + auto src_iter = underlying_buffers_.find(src_op_id); + auto dst_iter = underlying_buffers_.find(dst_op_id); + return core_to_driver_[core_id]->TransferFromDeviceToDevice( + src_iter->second.get(), dst_iter->second.get(), {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::unique_ptr CompileProgram( + const xla::HloProto& source, int32_t num_replicas, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + 
operation_id, + [this, operation_id, source, num_replicas]() { + absl::MutexLock l(&mu_); + auto cph_iterator = + underlying_cph_ + .insert( + {operation_id, + std::vector>()}) + .first; + + std::vector> collected_events; + for (int i = 0; i < drivers_.size(); ++i) { + auto current_cph = + drivers_[i]->CompileProgram(source, num_replicas, {}); + cph_iterator->second.push_back(std::move(current_cph)); + collected_events.push_back(cph_iterator->second[i]->OnReady()); + } + return std::make_shared(this, operation_id, + collected_events); + }, + deps); + + return absl::make_unique(this, operation_id); + } + + std::unique_ptr LoadProgram( + int32_t core_id, const CompiledProgramHandle* handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert( + static_cast(handle)->operation_id()); + auto cph_op_id = + static_cast(handle)->operation_id(); + + ScheduleRequest( + operation_id, + [this, operation_id, cph_op_id, core_id]() { + absl::MutexLock l(&mu_); + auto cph_iter = underlying_cph_.find(cph_op_id); + + underlying_lph_.insert( + {operation_id, + core_to_driver_[core_id]->LoadProgram( + core_to_driver_core_[core_id], + cph_iter->second[core_to_driver_id_[core_id]].get(), {})}); + + return underlying_lph_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, + core_id); + } + + std::shared_ptr UnloadProgram( + std::unique_ptr handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert( + static_cast(handle.get())->operation_id()); + auto op_id = + static_cast(handle.get())->operation_id(); + auto core_id = + static_cast(handle.get())->core_id(); + + ScheduleRequest( + operation_id, + [this, op_id, core_id]() { + absl::MutexLock l(&mu_); + + auto lph_iter = underlying_lph_.find(op_id); + auto event = core_to_driver_[core_id]->UnloadProgram( + std::move(lph_iter->second), {}); + underlying_lph_.erase(lph_iter); + + return event; + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr ExecuteProgram( + LoadedProgramHandle* program, absl::Span inputs, + absl::Span outputs, + const xla::DeviceAssignmentProto& device_assignment, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(program)->operation_id()); + + auto op_id = static_cast(program)->operation_id(); + auto core_id = static_cast(program)->core_id(); + + std::vector input_op_ids; + std::vector output_op_ids; + + for (auto* input : inputs) { + auto input_dep = + static_cast(input)->operation_id(); + input_op_ids.push_back(input_dep); + deps.insert(input_dep); + } + for (auto* output : outputs) { + auto output_dep = + static_cast(output)->operation_id(); + output_op_ids.push_back(output_dep); + deps.insert(output_dep); + } + + ScheduleRequest( + operation_id, + [this, core_id, op_id, input_op_ids, output_op_ids, + device_assignment]() { + absl::MutexLock l(&mu_); + + std::vector underlying_inputs; + std::vector underlying_outputs; + + underlying_inputs.reserve(input_op_ids.size()); + for (auto input_op_id : input_op_ids) { + underlying_inputs.push_back(underlying_buffers_[input_op_id].get()); + } + underlying_outputs.reserve(output_op_ids.size()); + for (auto output_op_id : output_op_ids) { + underlying_outputs.push_back( + underlying_buffers_[output_op_id].get()); + } + + LoadedProgramHandle* handle = 
underlying_lph_[op_id].get(); + return core_to_driver_[core_id]->ExecuteProgram( + handle, underlying_inputs, underlying_outputs, device_assignment, + {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::unique_ptr GetLinearizer() override { + return drivers_[0]->GetLinearizer(); + } + + // Helper methods for Event scheduling + + absl::optional WaitForEvent(int64_t event_id, + absl::Duration duration) { + std::shared_ptr underlying_event; + + { + absl::MutexLock l(&event_mu_); + auto event = events_.find(event_id); + + if (event == events_.end()) { + auto event_status = abnormal_event_status_.find(event_id); + if (event_status == abnormal_event_status_.end()) { + return Status::OK(); + } else { + return event_status->second; + } + } + + auto done = [this, event_id]() { + event_mu_.AssertHeld(); + return events_[event_id].underlying_event != nullptr; + }; + + auto status = + event_mu_.AwaitWithTimeout(absl::Condition(&done), duration); + if (!status) { + return absl::nullopt; + } + underlying_event = events_[event_id].underlying_event; + } + + // Wait for the underlying event without holding on to the event_lock_, or + // else incoming events will not be processed. + return underlying_event->AwaitWithTimeout(duration); + } + + void AddCallbackForEvent(int64_t event_id, std::function fn) { + absl::MutexLock l(&event_mu_); + auto event = events_.find(event_id); + + if (event == events_.end()) { + auto event_status = abnormal_event_status_.find(event_id); + if (event_status == abnormal_event_status_.end()) { + fn(Status::OK()); + } else { + fn(event_status->second); + } + } + + if (event->second.underlying_event != nullptr) { + event->second.underlying_event->AddCallback(fn); + } else { + event->second.callbacks.push_back(std::move(fn)); + } + } + + xla::Status GetCompiledProgramShape(int64_t op_id, + xla::ProgramShapeProto* program_shape) { + absl::MutexLock l(&mu_); + + auto done = [this, op_id]() { + mu_.AssertHeld(); + return underlying_cph_.contains(op_id); + }; + mu_.Await(absl::Condition(&done)); + + return underlying_cph_[op_id][0]->program_shape(program_shape); + } + + private: + const TpuDriverConfig& config_; + std::shared_ptr<::grpc::ChannelCredentials> creds_; + + std::vector> drivers_; + std::vector core_to_driver_id_; + std::vector core_to_driver_; + std::vector core_to_driver_core_; + SystemInfo pod_info_; + + absl::Mutex mu_; + absl::Mutex event_mu_; + + absl::flat_hash_map> + underlying_buffers_ ABSL_GUARDED_BY(mu_); + absl::flat_hash_map>> + underlying_cph_ ABSL_GUARDED_BY(mu_); + absl::flat_hash_map> + underlying_lph_ ABSL_GUARDED_BY(mu_); + + absl::btree_map events_ ABSL_GUARDED_BY(event_mu_); + absl::flat_hash_map abnormal_event_status_ + ABSL_GUARDED_BY(event_mu_); + + std::atomic operation_id_counter_{0}; + + WorkerThread event_thread_; + + int64_t GetOperationId() { return operation_id_counter_++; } + + absl::flat_hash_set GetDependencyOperationIds( + absl::Span wait_for) { + absl::flat_hash_set deps; + for (auto* event : wait_for) { + deps.insert(static_cast(event)->operation_id()); + } + return deps; + } + + // EventCompleted is executed on the event_thread_ worker thread. We want + // to propagate the fact that the event is completed to any subsequent events + // that might depend on this event. 
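[Editorial aside, not part of the patch] ScheduleRequest and EventCompleted together maintain a small dependency graph keyed by operation id: a request either runs at once (all of its dependencies have already completed) or is parked until EventCompleted erases its last incomplete dependency. The toy below is a hypothetical, single-threaded analogue with the mutexes, Event objects, and callbacks stripped out, kept only to show that bookkeeping.

#include <cstdint>
#include <functional>
#include <map>
#include <set>
#include <vector>

// Hypothetical, single-threaded analogue of ScheduleRequest/EventCompleted.
class ToyScheduler {
 public:
  void Schedule(int64_t id, std::function<void()> fn,
                const std::set<int64_t>& deps) {
    // Only dependencies that are still pending count as incomplete; ids that
    // have already completed are simply absent from pending_.
    std::set<int64_t> incomplete;
    for (int64_t d : deps) {
      if (pending_.count(d) > 0) incomplete.insert(d);
    }
    if (incomplete.empty()) {
      fn();           // run immediately, like the create_fn fast path
      Completed(id);  // and propagate completion to any waiters
    } else {
      pending_[id] = Pending{std::move(fn), std::move(incomplete)};
    }
  }

 private:
  struct Pending {
    std::function<void()> run;          // deferred work (create_fn analogue)
    std::set<int64_t> incomplete_deps;  // ids this request still waits on
  };

  void Completed(int64_t id) {
    std::vector<int64_t> ready;
    for (auto& entry : pending_) {
      Pending& p = entry.second;
      // erase() returns 1 only if this request was actually waiting on id.
      if (p.incomplete_deps.erase(id) == 1 && p.incomplete_deps.empty()) {
        ready.push_back(entry.first);
      }
    }
    for (int64_t r : ready) {
      std::function<void()> fn = std::move(pending_[r].run);
      pending_.erase(r);
      fn();
      Completed(r);  // completion cascades, as in EventCompleted
    }
  }

  std::map<int64_t, Pending> pending_;
};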
+ void EventCompleted(int64_t event_id, Status status) { + absl::MutexLock l(&event_mu_); + + absl::btree_map::iterator curr_event; + if (!status.ok()) abnormal_event_status_.insert({event_id, status}); + curr_event = events_.find(event_id); + + DCHECK(curr_event->second.callbacks.empty()); + DCHECK(curr_event->second.incomplete_deps.empty()); + + for (auto& event : events_) { + event.second.incomplete_deps.erase(event_id); + // The if statement conditions on both + // - all previous events have completed (incomplete_deps.empty()) + // - the op creating this event has not been called yet + // (event.second.create_fn != nullptr) + // We call the create_fn that creates the event and adds any relevant + // callbacks to the actual event, before setting create_fn to nullptr + // to indicate that it has already been called + if (event.second.incomplete_deps.empty() && + event.second.create_fn != nullptr) { + // We were the last unfilled dependency, all other dependencies are + // filled. We can now fire the create function. + event.second.underlying_event = event.second.create_fn(); + for (auto& fn : event.second.callbacks) { + event.second.underlying_event->AddCallback(std::move(fn)); + } + event.second.callbacks.clear(); + event.second.create_fn = nullptr; + } + } + + // We erase the current event to signal that it has finished. + events_.erase(curr_event); + } + + void ScheduleRequest(int64_t operation_id, + std::function(void)> fn, + const absl::flat_hash_set& deps) { + absl::MutexLock l(&event_mu_); + absl::btree_map::iterator event; + absl::flat_hash_set incomplete_deps; + + event = events_.insert({operation_id, {}}).first; + for (const auto& dep : deps) { + if (events_.count(dep) > 0) incomplete_deps.insert(dep); + } + + if (incomplete_deps.empty()) { + // All dependencies have been fulfilled, we execute the request + // immediately and add a callback to inform our event fulfilled thread + // when it is done. + event->second.create_fn = nullptr; + event->second.underlying_event = fn(); + event->second.underlying_event->AddCallback( + [this, operation_id](Status status) { + event_thread_.Schedule([this, operation_id, status]() { + EventCompleted(operation_id, status); + }); + }); + } else { + // There are some dependencies that are not yet fulfilled. We attach + // the request to the event, and will execute it in the EventFulfilled + // worker thread when all its dependencies are fulfilled. 
+ event->second.create_fn = std::move(fn); + event->second.incomplete_deps = std::move(incomplete_deps); + event->second.callbacks.push_back([this, operation_id](Status status) { + event_thread_.Schedule([this, operation_id, status]() { + EventCompleted(operation_id, status); + }); + }); + } + } +}; + +xla::Status PodEvent::Await() { + return driver_->WaitForEvent(operation_id_, absl::InfiniteDuration()).value(); +} + +absl::optional PodEvent::AwaitWithTimeout( + absl::Duration duration) { + return driver_->WaitForEvent(operation_id_, duration); +} + +void PodEvent::AddCallback(std::function callback) { + driver_->AddCallbackForEvent(operation_id_, std::move(callback)); +} + +xla::StatusOr> CreatePodTpuDriver( + const TpuDriverConfig& config, + std::shared_ptr<::grpc::ChannelCredentials> creds) { + return std::unique_ptr(new PodTpuDriver(config, creds)); +} + +xla::Status PodCompiledProgramHandle::program_shape( + xla::ProgramShapeProto* program_shape) { + return driver_->GetCompiledProgramShape(operation_id(), program_shape); +} + +} // namespace + +REGISTER_TPU_DRIVER(kPodTpuDriverPrefix, + [](const TpuDriverConfig& config) + -> xla::StatusOr> { + return CreatePodTpuDriver( + config, + ::grpc::InsecureChannelCredentials()); // NOLINT + }); + +} // namespace tpu_driver diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 9590c5d57c3..06605660b63 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -44,11 +44,13 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" +#include "tensorflow/compiler/xla/python/jax_jit.h" #include "tensorflow/compiler/xla/python/ops.h" #include "tensorflow/compiler/xla/python/outfeed_receiver_py.h" #include "tensorflow/compiler/xla/python/py_buffer.h" #include "tensorflow/compiler/xla/python/py_executable.h" #include "tensorflow/compiler/xla/python/python_ref_manager.h" +#include "tensorflow/compiler/xla/python/pytree.h" #include "tensorflow/compiler/xla/python/traceback.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -169,13 +171,13 @@ class TraceMeWrapper : public tensorflow::profiler::TraceMeWrapper { void BuildProfilerSubmodule(py::module* m) { py::module profiler = m->def_submodule("profiler", "TensorFlow profiler integration"); - py::class_> + py::class_> profiler_server_class(profiler, "ProfilerServer"); profiler.def( "start_server", - [](int port) -> std::unique_ptr { - auto server = absl::make_unique(); + [](int port) -> std::unique_ptr { + auto server = absl::make_unique(); server->StartProfilerServer(port); return server; }, @@ -437,26 +439,26 @@ PYBIND11_MODULE(xla_extension, m) { device_assignment); }); - py::class_>( + py::class_>( m, "Device", "A descriptor of an available device.\n\nSubclasses are used to " "represent specific types of devices, e.g. CPUs, GPUs. 
Subclasses may " "have additional properties specific to that device type.") .def_property_readonly( - "id", &Device::id, + "id", &PjRtDevice::id, "Integer ID of this device.\n\nUnique across all available devices " "of this type, including remote devices on multi-host platforms.") - .def_property_readonly("host_id", &Device::host_id, + .def_property_readonly("host_id", &PjRtDevice::host_id, "Integer ID of this device's host.\n\n" "This is always 0 except on multi-host platforms.") - .def_property_readonly("platform", &Device::platform_name) - .def_property_readonly("device_kind", &Device::device_kind) + .def_property_readonly("platform", &PjRtDevice::platform_name) + .def_property_readonly("device_kind", &PjRtDevice::device_kind) .def_property_readonly( "client", - [](const ClientAndPtr& device) { return device.client; }) - .def("__str__", &Device::DebugString) + [](const ClientAndPtr& device) { return device.client; }) + .def("__str__", &PjRtDevice::DebugString) .def("transfer_to_infeed", - [](const Device& device, const LiteralSlice& literal) { + [](const PjRtDevice& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, @@ -466,7 +468,8 @@ PYBIND11_MODULE(xla_extension, m) { }) .def( "transfer_from_outfeed", - [](const Device& device, const Shape& shape) -> StatusOr { + [](const PjRtDevice& device, + const Shape& shape) -> StatusOr { GlobalPyRefManager()->CollectGarbage(); std::shared_ptr literal_shared; { @@ -490,12 +493,12 @@ PYBIND11_MODULE(xla_extension, m) { return LiteralToPython(std::move(literal_shared)); }); - py::class_>(m, "CpuDevice") + py::class_>(m, "CpuDevice") .def("__repr__", [](const CpuDevice& device) { return absl::StrFormat("CpuDevice(id=%i)", device.id()); }); - py::class_>(m, "GpuDevice") + py::class_>(m, "GpuDevice") .def("__repr__", [](const GpuDevice& device) { return absl::StrFormat("GpuDevice(id=%i)", device.id()); }); @@ -654,7 +657,7 @@ PYBIND11_MODULE(xla_extension, m) { PyTypeObject* buffer_type = reinterpret_cast(buffer.ptr()); buffer_type->tp_as_buffer = PyBuffer::BufferProtocol(); - py::class_> executable( + py::class_> executable( m, "Executable"); executable.def_property_readonly("client", &PyExecutable::client) .def("local_logical_device_ids", &PyExecutable::local_logical_device_ids) @@ -737,7 +740,7 @@ PYBIND11_MODULE(xla_extension, m) { .def(py::init([](const py::bytes& serialized_hlo_module_proto) -> std::unique_ptr { HloModuleProto proto; - proto.ParseFromString(serialized_hlo_module_proto); + proto.ParseFromString(std::string(serialized_hlo_module_proto)); return absl::make_unique(proto); })) .def("get_hlo_module", &GetHloModule) @@ -897,6 +900,8 @@ PYBIND11_MODULE(xla_extension, m) { BuildOpsSubmodule(&m); BuildProfilerSubmodule(&m); BuildOutfeedReceiverSubmodule(&m); + BuildPytreeSubmodule(m); + BuildJaxjitSubmodule(m); py::class_> diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 38c55c6fe5d..da548ca1f0d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -113,16 +113,17 @@ def _get_local_backends(): _local_backends = collections.OrderedDict() for name, factory in _local_backend_factories.items(): - logging.vlog(2, "Initializing backend '%s'" % name) + logging.vlog(1, "Initializing backend '%s'" % name) try: backend = factory() - except RuntimeError: + except RuntimeError as err: if name == 'cpu': # We always 
expect CPU to initialize successfully. raise else: # If the backend isn't built into the binary, or if it has no devices, # we expect a RuntimeError. + logging.vlog(1, "Error initializing backend '%s': %s" % (name, err)) continue _local_backends[name] = backend return _local_backends diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 49431b19a69..6874d00445c 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -83,6 +83,7 @@ cc_library( deps = [ ":bfloat16_support", ":hlo", + ":hlo_dataflow_analysis", ":hlo_pass", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -1431,6 +1432,7 @@ cc_library( ":hlo_live_range", ":hlo_ordering", ":hlo_proto_cc", + ":memory_space_assignment_repacking", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -1683,6 +1685,7 @@ cc_library( hdrs = ["multi_output_fusion.h"], deps = [ ":hlo", + ":hlo_dataflow_analysis", ":hlo_dce", ":hlo_pass", ":hlo_reachability", @@ -1700,7 +1703,10 @@ cc_library( cc_library( name = "hlo_creation_utils", srcs = ["hlo_creation_utils.cc"], - hdrs = ["hlo_creation_utils.h"], + hdrs = [ + "hlo_creation_utils.h", + "//tensorflow/compiler/xla:literal_util", + ], deps = [ ":hlo", ":hlo_module_config", @@ -1816,6 +1822,21 @@ cc_library( ], ) +cc_library( + name = "comparison_expander", + srcs = ["comparison_expander.cc"], + hdrs = ["comparison_expander.h"], + deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_pass", + ":op_expander_pass", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client/lib:comparators", + ], +) + cc_library( name = "scatter_expander", srcs = ["scatter_expander.cc"], @@ -1824,6 +1845,7 @@ cc_library( ":hlo", ":hlo_creation_utils", ":hlo_pass", + ":op_expander_pass", ":while_util", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", @@ -2259,6 +2281,7 @@ tf_cc_test( srcs = ["gather_expander_test.cc"], deps = [ ":gather_expander", + ":hlo_query", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_macros_header", @@ -2319,9 +2342,11 @@ cc_library( ":call_inliner", ":hlo", ":hlo_casting_utils", + ":hlo_cse", ":hlo_dce", ":hlo_pass", ":hlo_pass_pipeline", + ":hlo_verifier", ":tuple_simplifier", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -2665,6 +2690,7 @@ cc_library( ":hlo_casting_utils", ":hlo_dce", ":hlo_pass", + ":shape_inference", "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -2688,7 +2714,6 @@ xla_test( ":dynamic_padder", ":hlo", ":hlo_dce", - ":hlo_get_dimension_size_rewriter", ":hlo_matchers", ":hlo_parser", "//tensorflow/compiler/xla:debug_options_flags", @@ -3407,6 +3432,35 @@ cc_library( ], ) +cc_library( + name = "memory_space_assignment_repacking", + hdrs = ["memory_space_assignment_repacking.h"], + deps = [ + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + ], +) + +cc_library( + name = "memory_space_assignment_best_fit_repacker", + srcs = ["memory_space_assignment_best_fit_repacker.cc"], + hdrs = ["memory_space_assignment_best_fit_repacker.h"], + deps = [ + ":heap_simulator", + ":memory_space_assignment_repacking", + ], +) + +tf_cc_test( + name = "memory_space_assignment_best_fit_repacker_test", + srcs = 
["memory_space_assignment_best_fit_repacker_test.cc"], + deps = [ + ":memory_space_assignment_best_fit_repacker", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "memory_space_assignment", srcs = ["memory_space_assignment.cc"], @@ -3414,6 +3468,7 @@ cc_library( deps = [ ":heap_simulator", ":hlo_cost_analysis", + ":memory_space_assignment_repacking", ":memory_space_assignment_utils", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/core/lib/math:math_util", @@ -3968,42 +4023,6 @@ tf_cc_test( ], ) -cc_library( - name = "hlo_get_dimension_size_rewriter", - srcs = ["hlo_get_dimension_size_rewriter.cc"], - hdrs = ["hlo_get_dimension_size_rewriter.h"], - deps = [ - ":dynamic_dimension_inference", - ":hlo", - ":hlo_pass", - ":shape_inference", - "//tensorflow/compiler/xla:literal_util", - "@com_google_absl//absl/algorithm:container", - ], -) - -tf_cc_test( - name = "hlo_get_dimension_size_rewriter_test", - srcs = ["hlo_get_dimension_size_rewriter_test.cc"], - deps = [ - ":hlo", - ":hlo_get_dimension_size_rewriter", - ":hlo_matchers", - ":hlo_parser", - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - cc_library( name = "maybe_owning_device_memory", srcs = [ diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 0b588048e4a..4e7bd85e557 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -665,7 +665,7 @@ Status AlgebraicSimplifierVisitor::ScalarMultiplyReduction( HloInstruction* inst; HloInstruction* user; int64 index; - std::tie (inst, user, index) = operands.back(); + std::tie(inst, user, index) = operands.back(); operands.pop_back(); // Skip the op types that are not commutative with multiply. @@ -913,7 +913,7 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) && Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) && (ShapeUtil::ElementIsIntegral(add->shape()) || - IsAllFpConstantPowerOf2(c))) { + options_.enable_floats_are_real() || IsAllFpConstantPowerOf2(c))) { return ReplaceWithNewInstruction( add, HloInstruction::CreateBinary( add->shape(), HloOpcode::kMultiply, @@ -1236,6 +1236,10 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( return Status::OK(); } + if (options_.is_layout_sensitive()) { + return Status::OK(); + } + // Check if we can merge "adjacent" slice operands which take slices from the // same other op. For simplicity we only merge unstrided slices. int64 concatenate_dimension = concatenate->concatenate_dimension(); @@ -1296,7 +1300,15 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( auto replacement = computation_->AddInstruction(concatenate->CloneWithNewOperands( concatenate->shape(), new_operands)); - ReplaceInstructionIfSameShape(concatenate, replacement); + + // Recurse to handle multiple disjoint sequence of inputs. The + // logic above merge only 1 sequential series of + // inputs. 
Otherwise, it can lead to the FixPass optimization + // hitting its threshold. + if (ReplaceInstructionIfSameShape(concatenate, replacement)) { + return HandleConcatenate(replacement); + } + return Status::OK(); } } @@ -1335,6 +1347,23 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( operands[pad_value_operand]->mutable_operand(0), padding_config)); return ReplaceInstruction(concatenate, pad); } + + if (absl::c_count(operands, operands[0]) == operands.size() && + operands[0]->shape().dimensions(concatenate_dimension) == 1) { + Shape new_shape = operands[0]->shape(); + absl::InlinedVector broadcast_dims; + for (int64 i = 0; i < new_shape.rank(); ++i) { + if (i == concatenate_dimension) { + continue; + } + broadcast_dims.push_back(i); + } + new_shape.DeleteDimension(concatenate_dimension); + return ReplaceInstruction( + concatenate, + MakeBroadcastHlo(MakeReshapeHlo(new_shape, operands[0]).ValueOrDie(), + broadcast_dims, concatenate->shape())); + } return Status::OK(); } @@ -2479,6 +2508,20 @@ Status AlgebraicSimplifierVisitor::HandleGather(HloInstruction* gather) { if (ShapeUtil::IsZeroElementArray(operand_shape)) { return ReplaceInstruction(gather, MakeScalarLike(gather, 0)); } + + // Gathering from a scalar operand is simply a broadcast of that scalar + if (ShapeUtil::IsEffectiveScalar(operand_shape)) { + HloInstruction* new_operand = gather->mutable_operand(0); + if (operand_shape.rank()) { + TF_ASSIGN_OR_RETURN(new_operand, + MakeReshapeHlo(ShapeUtil::MakeScalarShape( + operand_shape.element_type()), + new_operand)); + } + HloInstruction* new_gather = + MakeBroadcastHlo(new_operand, {}, gather->shape()); + return ReplaceInstruction(gather, new_gather); + } // If the operand of a gather is very small, it is easier to fuse a // sequence of selects. 
const Shape& index_shape = gather->operand(1)->shape(); @@ -2667,6 +2710,17 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } + { + HloInstruction* abs_operand; + if (lhs == rhs && Match(lhs, m::Abs(m::Op(&abs_operand))) && + !ShapeUtil::ElementIsComplex(abs_operand->shape())) { + TF_RETURN_IF_ERROR(multiply->ReplaceOperandWith(0, abs_operand)); + TF_RETURN_IF_ERROR(multiply->ReplaceOperandWith(1, abs_operand)); + changed_ = true; + return Status::OK(); + } + } + { HloInstruction *convert_operand, *operand; // Mul(Convert(Pred), operand) => select(pred, operand, 0) @@ -2691,7 +2745,7 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { // Mul(Mul(x, constant1), Mul(y, constant2)) => Mul(Mul(x, y), // constant1*constant2) if (Match(multiply, - m::Multiply( + m::MultiplyAnyOrder( m::MultiplyAnyOrder(m::NonConstant(&a), m::Constant(&c1)), m::MultiplyAnyOrder(m::NonConstant(&b), m::Constant(&c2))))) { TF_ASSIGN_OR_RETURN(auto* product_of_constants, @@ -2713,6 +2767,29 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { } } + { + HloInstruction *a, *c1, *c2; + // Mul(Mul(a, constant1), constant2) => Mul(a, constant1*constant2) + if (Match(multiply, + m::MultiplyAnyOrder( + m::MultiplyAnyOrder(m::NonConstant(&a), m::Constant(&c1)), + m::Constant(&c2)))) { + TF_ASSIGN_OR_RETURN(auto* product_of_constants, + MakeBinaryHlo(HloOpcode::kMultiply, c1, c2)); + if (ShapeUtil::IsScalar(product_of_constants->shape()) && + !ShapeUtil::IsScalar(multiply->shape())) { + product_of_constants = + computation_->AddInstruction(HloInstruction::CreateBroadcast( + multiply->shape(), product_of_constants, {})); + } + + return ReplaceWithNewInstruction( + multiply, + HloInstruction::CreateBinary(multiply->shape(), HloOpcode::kMultiply, + a, product_of_constants)); + } + } + { HloInstruction *a, *b, *constant, *op; // Mul(Mul(a, constant1), Broadcast(b)) => @@ -3245,6 +3322,9 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { // padding with a pad with non-negative padding followed by a slice. bool all_zero = true; bool has_negative = false; + // Used to possibly split off the unchanged padding dimensions. + std::vector padding_dimensions; + int64 dimension_index = 0; for (auto& padding_dimension : pad->padding_config().dimensions()) { if (padding_dimension.edge_padding_low() < 0 || padding_dimension.edge_padding_high() < 0) { @@ -3253,12 +3333,93 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { if (padding_dimension.edge_padding_low() != 0 || padding_dimension.edge_padding_high() != 0) { all_zero = false; + padding_dimensions.push_back(dimension_index); + } else if (padding_dimension.interior_padding()) { + padding_dimensions.push_back(dimension_index); } + dimension_index++; } if (all_zero) { - ReplaceInstructionIfSameShape(pad, pad->mutable_operand(0)); - return Status::OK(); + if (ReplaceInstructionIfSameShape(pad, pad->mutable_operand(0))) { + return Status::OK(); + } + } + + // The context of this optimization can be found at b/163617402 + // It tries to capture the case of pad(broadcast(x)), where + // x->shape().dimensions(), or broadcast(x)->dimensions(), is + // a subset of the padded dimensions in pad->config(), + // and the padded dimensions in pad->config() is in turn a strict + // subset of broadcast->shape().dimensions(). 
The combined op can be + // rewritten to broadcast2(pad(broadcast1(x))), where broadcast1 extends + // x with dimensions that need to be padded, and broadcast2 extends + // the result of padding to full dimensions. + // TODO(qyi): for future extensions: The condition for broadcast(x) + // ->dimensions() to be a subset of padded dimensions in pad->config() + // does not have to be strictly required, but it makes the calculation + // for optimization easier, so it is required by the current implementation. + // Only the second condition between the padded dimensions and the + // dimensions of the final shape have to be enforced for the optimization + // to make sense. If needed to remove the first constraint, the shape + // calculations across the implementation need to be re-adjusted. + auto pad_dims = padding_dimensions.size(); + if (pad_dims < dimension_index && + pad->operand(0)->opcode() == HloOpcode::kBroadcast && + pad->operand(0)->user_count() == 1 && + pad->operand(0)->operand(0)->shape().rank() <= pad_dims) { + // Check broadcast operand dimensions is a subset of pading_dimensions. + // If not, skip the optimization. + bool opt_is_valid = true; + std::vector broadcast_dimensions; + HloBroadcastInstruction* broadcast = + static_cast(pad->mutable_operand(0)); + for (auto broadcast_index : broadcast->dimensions()) { + bool found = false; + for (int i = 0; i < pad_dims; ++i) { + if (broadcast_index == padding_dimensions[i]) { + broadcast_dimensions.push_back(i); + found = true; + break; + } + } + if (!found) { + opt_is_valid = false; + break; + } + } + if (opt_is_valid) { + auto pad_shape = pad->shape(); + auto broadcast_shape = broadcast->shape(); + auto pad_shape1 = pad_shape; + auto broadcast_shape1 = broadcast_shape; + PaddingConfig pad_config; + for (int i = padding_dimensions.size() - 1; i >= 0; --i) { + int64 j = padding_dimensions[i]; + while (--dimension_index > j) { + broadcast_shape1.DeleteDimension(dimension_index); + pad_shape1.DeleteDimension(dimension_index); + } + } + while (--dimension_index >= 0) { + broadcast_shape1.DeleteDimension(dimension_index); + pad_shape1.DeleteDimension(dimension_index); + } + for (auto dimension_to_pad : padding_dimensions) { + auto dimension = pad_config.add_dimensions(); + *dimension = pad->padding_config().dimensions(dimension_to_pad); + } + *broadcast->mutable_shape() = broadcast_shape1; + *broadcast->mutable_dimensions() = broadcast_dimensions; + simplifier_->UpdateLayout(broadcast->mutable_shape()); + auto pad2 = + computation_->AddInstruction(pad->CloneWithNewShape(pad_shape1)); + *pad2->mutable_padding_config() = pad_config; + simplifier_->UpdateLayout(pad2->mutable_shape()); + auto broadcast2 = computation_->AddInstruction( + HloInstruction::CreateBroadcast(pad_shape, pad2, padding_dimensions)); + return ReplaceInstruction(pad, broadcast2); + } } if (has_negative) { @@ -3293,7 +3454,8 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { pad->shape(), nonzero_pad->mutable_shape())); simplifier_->UpdateLayout(nonzero_pad->mutable_shape()); - // Second, construct the slice instruction to perform the negative padding. + // Second, construct the slice instruction to perform the negative + // padding. 
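[Editorial aside, not part of the patch] A concrete instance of the pad(broadcast) reordering above appears in the BroadcastAndPadReorder test near the end of this change: a pred[] constant is broadcast to pred[32,1,768] and then padded to pred[4096,1,768] with padding 0_4064x0_0x0_0. Only dimension 0 is padded, so the rewrite emits broadcast1 (pred[] to pred[32], covering just the padded dimensions), pads that to pred[4096] with padding 0_4064, and finally broadcasts back to pred[4096,1,768] along dimension {0}; the pad now operates on a rank-1 shape instead of the full rank-3 shape.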
std::vector start_indices; + std::vector end_indices; + std::vector strides; @@ -4140,8 +4302,8 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice( return ReplaceWithNewInstruction(dynamic_slice, std::move(new_broadcast)); } - // Convert a dynamic slice into a slice if all offsets are constant and the - // operand is not constant. If ev + // Convert a dynamic slice into a slice if all offsets are constant and the + // operand is not constant. if (operand->opcode() != HloOpcode::kConstant && absl::c_all_of(absl::MakeSpan(dynamic_slice->operands().begin() + 1, dynamic_slice->operands().end()), @@ -5109,10 +5271,10 @@ StatusOr AlgebraicSimplifierVisitor::SwapConvOperands( if (!reverse_dimensions.empty()) { TF_ASSIGN_OR_RETURN(kernel, MakeReverseHlo(kernel, reverse_dimensions)); } - TF_ASSIGN_OR_RETURN( - HloInstruction * new_convolution, - MakeConvolveHlo(kernel, input, /*feature_group_count=*/1, swapped_window, - swapped_dnums, precision_config)); + TF_ASSIGN_OR_RETURN(HloInstruction * new_convolution, + MakeConvolveHlo(kernel, input, /*feature_group_count=*/1, + /*batch_group_count=*/1, swapped_window, + swapped_dnums, precision_config)); convolution->SetupDerivedInstruction(new_convolution); TF_RETURN_IF_ERROR(ReplaceInstruction(convolution, new_convolution)); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 9f2a3404116..cabecec4eb8 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -97,6 +97,14 @@ class AlgebraicSimplifierOptions { return enable_scalar_multiply_reduction_; } + // Allow the algebraic simplifier to treat floating point values like real + // numbers. + void set_enable_floats_are_real(bool enable_floats_are_real) { + enable_floats_are_real_ = enable_floats_are_real; + } + + bool enable_floats_are_real() const { return enable_floats_are_real_; } + // If enable_window_reduce_replacement is true, the kReduceWindow instruction // can be optimized by replacement with simpler operations.
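[Editorial aside, not part of the patch] The new option is consumed in HandleAdd above, where it lifts the power-of-two restriction when factoring A*C + B*C into (A+B)*C for floating-point types. A minimal sketch of opting in, assuming an existing std::unique_ptr<HloModule> named module:

// Hypothetical illustration only.
AlgebraicSimplifierOptions options;
options.set_enable_floats_are_real(true);  // treat fp values as real numbers
AlgebraicSimplifier simplifier(options);
bool changed = simplifier.Run(module.get()).ValueOrDie();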
void set_enable_window_reduce_to_reduce_replacement( @@ -158,6 +166,7 @@ class AlgebraicSimplifierOptions { bool enable_conv_simplification_{true}; bool enable_conv_operand_swap_{true}; bool enable_scalar_multiply_reduction_{false}; + bool enable_floats_are_real_{false}; bool enable_window_reduce_to_reduce_replacement_{true}; bool enable_reduce_of_reshape_{true}; bool replace_transpose_with_bitcast_{true}; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 90ca44714f7..c4f3ea4087b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -117,6 +117,22 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAddition) { m::ConstantScalar(0.125)))); } +// (Abs(A)) * (Abs(A)) => (A*A) +TEST_F(AlgebraicSimplifierTest, SquareOfAbs) { + const char* kModuleStr = R"( + HloModule m + test { + p = f32[] parameter(0) + a = f32[] abs(p) + ROOT z = f32[] multiply(a, a) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0)))); +} + // (A*C1) * (B*C2) => (A*B)*(C1*C2) TEST_F(AlgebraicSimplifierTest, MultiplyChain) { const char* kModuleStr = R"( @@ -140,6 +156,26 @@ TEST_F(AlgebraicSimplifierTest, MultiplyChain) { m::MultiplyAnyOrder(m::ConstantScalar(2), m::ConstantScalar(4))))); } +// (a*C1)*C2 => a*(C1*C2) +TEST_F(AlgebraicSimplifierTest, MultiplyChain2) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + a = f32[] constant(2) + b = f32[] constant(4) + c = f32[] multiply(p0, a) + ROOT y = f32[] multiply(c, b) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::Parameter(0), m::MultiplyAnyOrder(m::ConstantScalar(2), + m::ConstantScalar(4))))); +} + // MUL(MUL(X, BROADCAST(constant)), BROADCAST(Y)) ==> // MUL(X, BROADCAST(MUL(Y, BROADCAST(constant)))) TEST_F(AlgebraicSimplifierTest, MultiplyBroadcastReassoc) { @@ -2299,7 +2335,7 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {100, 99}); - Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 80}); + Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 90}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); @@ -2346,10 +2382,15 @@ TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { HloInstruction* slice7 = builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 79}, /*limit_indices=*/{100, 89}, /*strides=*/{1, 1})); + // Can merge 'slice7' and 'slice8'. 
+ HloInstruction* slice8 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 89}, + /*limit_indices=*/{100, 99}, /*strides=*/{1, 1})); builder.AddInstruction(HloInstruction::CreateConcatenate( concat_shape, - {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7}, 1)); + {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7, slice8}, + 1)); auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(default_options_); @@ -2364,6 +2405,12 @@ TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { ShapeUtil::Equal(computation->root_instruction()->operand(3)->shape(), ShapeUtil::MakeShape(F32, {50, 30}))); EXPECT_EQ(computation->root_instruction()->operand(3)->slice_starts(1), 40); + + // The operand 6 should be merge of 'slice7' and 'slice8', so its + // shape should have dimensions {50, 20} + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->operand(5)->shape(), + ShapeUtil::MakeShape(F32, {50, 20}))); } // Test that a simplification which changes layouts is not performed if layout @@ -4823,6 +4870,25 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { EXPECT_EQ(root->slice_limits(0), 2); } +TEST_F(AlgebraicSimplifierTest, ConcatToBroadcast) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + p = f32[2,1,4] parameter(0) + ROOT concat = f32[2,6,4] concatenate(p,p,p,p,p,p), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0))))); +} + TEST_F(AlgebraicSimplifierTest, NegateNegate) { const char* hlo_string = R"( HloModule module @@ -5608,6 +5674,30 @@ INSTANTIATE_TEST_SUITE_P( DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest, ::testing::ValuesIn(DotOfGatherPositiveNegativeTests())); +TEST_F(AlgebraicSimplifierTest, GatherOfScalarToBroadcast) { + const char* hlo_string = R"( + HloModule repeat + + ENTRY main { + o = f32[1,1] parameter(0) + i = s32[100,2] parameter(1) + ROOT g = f32[100] gather(o, i), collapsed_slice_dims={0,1}, + start_index_map={0,1}, + index_vector_dim=1, + offset_dims={}, + slice_sizes={1,1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0))))); +} + TEST_F(AlgebraicSimplifierTest, TupleReduceReshape) { const char* hlo_string = R"( HloModule module @@ -6892,5 +6982,57 @@ TEST_F(AlgebraicSimplifierTest, UnaryVariadicReduce) { GmockMatch(m::Add(m::Parameter(0), m::Parameter(1)))); } +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorder) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[] constant(true) + b2 = pred[32,1,768]{2,1,0} broadcast(pred[] c1), dimensions={} + c3 = pred[] constant(false) + ROOT p4 = pred[4096,1,768]{2,1,0} pad(pred[32,1,768]{2,1,0} b2, pred[] c3), padding=0_4064x0_0x0_0 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + 
ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast( + m::Pad(m::Broadcast(m::Constant()), m::Constant())))); +} + +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorderWithUse) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[] constant(true) + b2 = pred[1,768,32]{2,1,0} broadcast(pred[] c1), dimensions={} + c3 = pred[] constant(false) + p4 = pred[1,768,4096]{2,1,0} pad(pred[1,768,32]{2,1,0} b2, pred[] c3), padding=0_0x0_0x0_4064 + ROOT p5 = (pred[1,768,4096]{2,1,0}) tuple(pred[1,768,4096]{2,1,0} p4) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Tuple(m::Broadcast( + m::Pad(m::Broadcast(m::Constant()), m::Constant()))))); +} + +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorderWithNonScalar) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[32] parameter(0) + b2 = pred[1,768,32]{2,1,0} broadcast(pred[32] c1), dimensions={2} + c3 = pred[] constant(false) + p4 = pred[1,768,4096]{2,1,0} pad(pred[1,768,32]{2,1,0} b2, pred[] c3), padding=0_0x0_0x0_4064 + ROOT p5 = (pred[1,768,4096]{2,1,0}) tuple(pred[1,768,4096]{2,1,0} p4) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Tuple(m::Broadcast( + m::Pad(m::Broadcast(m::Parameter()), m::Constant()))))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner.cc b/tensorflow/compiler/xla/service/all_reduce_combiner.cc index 9d8f03c92ca..5fb4935a4b1 100644 --- a/tensorflow/compiler/xla/service/all_reduce_combiner.cc +++ b/tensorflow/compiler/xla/service/all_reduce_combiner.cc @@ -268,6 +268,11 @@ StatusOr AllReduceCombiner::Run(HloModule* module) { VLOG(1) << "Running AllReduceCombiner with threshold of " << combine_threshold_in_bytes_ << " bytes"; + if (combine_threshold_in_bytes_ <= 0 || combine_threshold_count_ <= 0) { + VLOG(1) << "Skip AllReduceCombiner because the threshold is zero"; + return false; + } + if (hlo_query::ContainsLayoutConstrainedAllReduce(*module)) { VLOG(1) << "Skip AllReduceCombiner because the module contains all-reduce " "with constrained layouts"; diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc index 541006f04d5..18a0fdc1a70 100644 --- a/tensorflow/compiler/xla/service/all_reduce_simplifier.cc +++ b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc @@ -31,27 +31,7 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { TF_ASSIGN_OR_RETURN( auto replication, HloReplicationAnalysis::Run(module, /*cross_partition_spmd=*/false)); - std::vector all_reduces_to_replace; - for (auto computation : module->computations()) { - for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { - if (!inst->shape().IsArray()) { - // We currently do not change tuple-shaped all-reduce. - // Until XLA will support Token fed AllReduce(), the PyTorch client code - // uses a fake data token (constant) which relies on this pass to not - // optimize out (being fed within a tuple input). 
- continue; - } - if (inst->IsCrossReplicaAllReduce() && - replication->HloInstructionIsReplicatedAt(inst->operand(0), {})) { - all_reduces_to_replace.push_back(inst); - } - } - } - - bool changed = false; - if (all_reduces_to_replace.empty()) { - return changed; - } + std::vector> all_reduces_to_replace; // Returns the size of a replica group if all groups have the same size, or -1 // if they have different sizes. @@ -71,7 +51,40 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { return replica_group_size; }; - for (auto all_reduce : all_reduces_to_replace) { + for (auto computation : module->computations()) { + for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { + if (!inst->shape().IsArray()) { + // We currently do not change tuple-shaped all-reduce. + // Until XLA will support Token fed AllReduce(), the PyTorch client code + // uses a fake data token (constant) which relies on this pass to not + // optimize out (being fed within a tuple input). + continue; + } + if (!inst->IsCrossReplicaAllReduce()) { + continue; + } + int64 group_size = get_replica_group_size(inst); + if (group_size == -1) { + continue; + } + if (replication->HloInstructionIsReplicatedAt(inst->operand(0), {}) || + group_size == 1) { + all_reduces_to_replace.push_back({inst, group_size}); + } + } + } + + bool changed = false; + + for (auto all_reduce_and_group_size : all_reduces_to_replace) { + auto all_reduce = all_reduce_and_group_size.first; + const int64 replica_group_size = all_reduce_and_group_size.second; + if (replica_group_size == 1) { + TF_RETURN_IF_ERROR(all_reduce->parent()->ReplaceInstruction( + all_reduce, all_reduce->mutable_operand(0))); + changed = true; + continue; + } if (all_reduce->to_apply()->instruction_count() != 3 || all_reduce->to_apply()->num_parameters() != 2) { continue; @@ -79,10 +92,6 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { HloInstruction* replacement; switch (all_reduce->to_apply()->root_instruction()->opcode()) { case HloOpcode::kAdd: { - int64 replica_group_size = get_replica_group_size(all_reduce); - if (replica_group_size == -1) { - continue; - } // Create the multiplier: // broadcast(convert_to_matching_type(s32 group size)) auto multiplier = diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc index 4914836b34a..1e938594cc3 100644 --- a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc @@ -167,5 +167,30 @@ test { m::Parameter(0), m::AllReduce(m::Parameter(1))))); } +TEST_F(AllReduceSimplifierTest, TrivialSubgroupAllReduce) { + const char* kModuleStr = R"( +HloModule m + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add.2 = f32[] add(a, b) +} + + +test { + p0 = f32[8,16] parameter(0), parameter_replication={false} + ROOT all-reduce = f32[8,16] all-reduce(p0), + replica_groups={{0},{1},{2},{3},{4},{5},{6},{7}}, + to_apply=sum +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + kModuleStr, /*replica_count=*/8)); + AllReduceSimplifier simplifier(/*replica_count=*/8); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Parameter(0))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc index 23d2a9225a8..73210e6b3dc 100644 --- 
a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -159,19 +160,20 @@ Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions( Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) { // Do not fold BF16 conversions for instructions related to tuples, entry and - // exit of a computation, fusion, convert, side-effecting instructions and - // control flow. - if (hlo->opcode() == HloOpcode::kTuple || // - hlo->opcode() == HloOpcode::kGetTupleElement || // - hlo->opcode() == HloOpcode::kConstant || // - hlo->opcode() == HloOpcode::kParameter || // - hlo->opcode() == HloOpcode::kFusion || // - hlo->opcode() == HloOpcode::kBitcastConvert || // - hlo->opcode() == HloOpcode::kConvert || // - hlo->opcode() == HloOpcode::kCall || // - hlo->opcode() == HloOpcode::kCustomCall || // - hlo->opcode() == HloOpcode::kWhile || // - hlo->opcode() == HloOpcode::kConditional || // + // exit of a computation, fusion, convert, side-effecting instructions, + // in-place operations and control flow. + if (hlo->opcode() == HloOpcode::kTuple || // + hlo->opcode() == HloOpcode::kGetTupleElement || // + hlo->opcode() == HloOpcode::kConstant || // + hlo->opcode() == HloOpcode::kParameter || // + hlo->opcode() == HloOpcode::kFusion || // + hlo->opcode() == HloOpcode::kBitcastConvert || // + hlo->opcode() == HloOpcode::kConvert || // + hlo->opcode() == HloOpcode::kCall || // + hlo->opcode() == HloOpcode::kCustomCall || // + hlo->opcode() == HloOpcode::kWhile || // + hlo->opcode() == HloOpcode::kConditional || // + HloDataflowAnalysis::IsInPlaceOperation(hlo->opcode()) || // hlo->HasSideEffectNoRecurse()) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index a0fe0eaa1d9..f9e19493a86 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -598,6 +598,31 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( type = F32; break; } + // In order to find aliases due to in-place operations, use + // GetInPlaceInputOutputPairs. Ideally, we'd use HloAliasAnalysis here, + // but this code works with HloModules that aren't ready yet to use + // HloAliasAnalysis (e.g., their computation graphs may not have been + // flattened yet). + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(hlo)) { + if (operand_and_output_index.second == index) { + const HloUse& operand = operand_and_output_index.first; + for (const auto* value : + dataflow_ + ->GetValueSet(hlo->operand(operand.operand_number), + operand.operand_index) + .values()) { + auto value_type = ValueTypeAfterChange(value); + if (value_type == BF16) { + continue; + } + CHECK_EQ(value_type, F32); + type = F32; + break; + } + } + } + // It's possible that a user has been changed from BF16 to F32 // during this final adjustment pass, so we need to check // AllUsersConsumeBF16() again. 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 02d79025f1b..9a898833373 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -1156,4 +1156,30 @@ ENTRY entry { EXPECT_FALSE(PropagatePrecision(module.get())); } +TEST_F(BFloat16PropagationTest, DynamicUpdateSlice) { + // This test is crafted so that the DUS has an f32 input (due to parameter) + // and bf16 output (due to dot). But we should enforce DUS operand 0 and + // output to get the same precision since it's an in-place operation. + const string module_str = R"( +HloModule Module + +ENTRY main { + param = f32[128,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + dynamic-update-slice = f32[128,128] dynamic-update-slice(param, broadcast.6, constant.3, constant.3) + ROOT dot = f32[128,128] dot(dynamic-update-slice, dynamic-update-slice), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + EXPECT_FALSE(PropagatePrecision(module.get())); + + HloInstruction* dus = module->entry_computation()->GetInstructionWithName( + "dynamic-update-slice"); + EXPECT_FALSE(OutputsBF16(dus)); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 6cd58b86f0c..db34f054f35 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -1007,102 +1007,6 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, return true; } // namespace xla -Status BufferAssigner::MergeInplaceOpBuffers(BufferAssignment* assignment) { - // Try allocate same buffer for dynamic update slice's operand and output. - - // If memory_space_assignment is run and there is information about a color in - // preset assignments, don't merge those buffers. We expect - // memory_space_assignment to have merged these buffers. If - // memory_space_assignment didn't merge these buffers and have assigned - // different offsets to the operand and the output buffer, merging the buffers - // can cause memory corruption if memory_space_assignment assigned a different - // buffer at the same offset. - absl::flat_hash_set excluded_colors; - if (preset_assignments_) { - for (const auto& color_and_info : - preset_assignments_->assignment_informations()) { - excluded_colors.insert(color_and_info.first); - } - } - - // TODO(yunxing): Moving this logic to alias analysis and add must-alias rule - // to operations that can be done in place. - for (HloComputation* computation : assignment->module().computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (!(instruction->opcode() == HloOpcode::kDynamicUpdateSlice || - (instruction->opcode() == HloOpcode::kFusion && - (instruction->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice)))) { - continue; - } - if (instruction->parent()->IsFusionComputation()) { - continue; - } - if (instruction->operand_count() == 0) { - continue; - } - - // The operand can't share the same buffer with the user based on dataflow - // analysis. 
- if (!assignment->dataflow_analysis().CanShareOperandBufferWithUser( - instruction->mutable_operand(0), {}, instruction, {})) { - continue; - } - HloBuffer& instruction_buffer = - assignment->alias_analysis().GetUniqueBufferAt(instruction, {}); - - HloBuffer& operand_buffer = - assignment->alias_analysis().GetUniqueBufferAt( - instruction->operand(0), {}); - - // The instruction or operand color is excluded because it was assigned by - // memory_space_assignment. - if (excluded_colors.contains(instruction_buffer.color()) || - excluded_colors.contains(operand_buffer.color())) { - continue; - } - - // Already have the same buffer. No need to merge those. - if (instruction_buffer.id() == operand_buffer.id()) { - continue; - } - - // Do not perform in-place dynamic update slice if the operand buffer is - // read-only. - if (HloBufferIsReadOnly(operand_buffer)) { - continue; - } - - bool interfere = false; - - for (const HloValue* instruction_value : instruction_buffer.values()) { - for (const HloValue* operand_value : operand_buffer.values()) { - if (assignment->hlo_ordering().MayInterfere( - *instruction_value, *operand_value, - assignment->dataflow_analysis())) { - interfere = true; - break; - } - } - } - if (interfere) { - continue; - } - if (assignment->alias_analysis().BufferLivesOut(instruction_buffer)) { - continue; - } - if (instruction_buffer.color() != operand_buffer.color()) { - continue; - } - VLOG(3) << "Merging inplace " << instruction_buffer << " and " - << operand_buffer; - assignment->alias_analysis().MergeBuffers(instruction_buffer, - operand_buffer); - } - } - return Status::OK(); -} - Status BufferAssigner::AssignSingleHloBuffer( const HloBuffer* hlo_buffer, bool is_thread_local, absl::flat_hash_map>>(); - algorithms->push_back(absl::make_unique( - alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); - algorithms->push_back(absl::make_unique( - alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); - return absl::make_unique(std::move(algorithms)); + auto algorithms = absl::make_unique< + std::vector>>>(); + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); + return absl::make_unique>( + std::move(algorithms)); }; if (run_whole_module_heap_simulation) { @@ -1461,7 +1368,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run( get_heap_algorithm(alignment), assignment->module(), schedule, assignment->alias_analysis(), assignment->buffer_size_, options)); @@ -1487,7 +1394,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( HeapSimulator::Options options; options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run(get_heap_algorithm(alignment), *computation, *instruction_sequence, assignment->alias_analysis(), @@ -1582,7 +1489,7 @@ std::vector ComputePeakMemoryLogicalBuffers( } // namespace void BufferAssigner::AssignBuffersFromHeapSimulator( - const HeapSimulator::Result& result, BufferAssignment* assignment, + const HeapSimulator::Result& result, BufferAssignment* assignment, BufferValue::Color color) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = @@ 
-1651,7 +1558,6 @@ StatusOr> BufferAssigner::CreateAssignment( VLOG(3) << "After coloring:"; XLA_VLOG_LINES(3, assignment->alias_analysis().dataflow_analysis().ToString()); - TF_RETURN_IF_ERROR(MergeInplaceOpBuffers(assignment.get())); std::vector thread_local_computations; std::vector global_computations; diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 50a4750601b..dfde46ca4b1 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -635,10 +635,6 @@ class BufferAssigner { absl::flat_hash_set* assigned_buffers, BufferAssignment* assignment); - // Promotes operations (DUS, scatter) to be done in place: If an operation can - // be done in place, merge its buffer with its operand buffer. - Status MergeInplaceOpBuffers(BufferAssignment* assignment); - // Assigns a single hlo buffer to an HLO allocation. Status AssignSingleHloBuffer( const HloBuffer* hlo_buffer, bool is_thread_local, @@ -661,9 +657,9 @@ class BufferAssigner { // Uses the results of the heap simulator to create a single allocation, with // LogicalBuffers packed to specific offsets. - void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, - BufferAssignment* assignment, - LogicalBuffer::Color color); + void AssignBuffersFromHeapSimulator( + const HeapSimulator::Result& result, + BufferAssignment* assignment, LogicalBuffer::Color color); // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index bc024f7144b..b49ca649f9a 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -1925,8 +1925,10 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_text)); HloInstruction* parameter = m->entry_computation()->GetInstructionWithName("get-tuple-element.4"); - HloInstruction* dus = + HloInstruction* dus1 = m->entry_computation()->GetInstructionWithName("dynamic-update-slice.5"); + HloInstruction* dus2 = + m->entry_computation()->GetInstructionWithName("dynamic-update-slice.9"); auto buffers = RunBufferAssignment(m.get()); @@ -1934,8 +1936,10 @@ ENTRY main { const BufferAllocation& parameter_alloc = GetTopLevelAllocation(*buffers, parameter); - const BufferAllocation& dus_alloc = GetTopLevelAllocation(*buffers, dus); - EXPECT_NE(parameter_alloc, dus_alloc); + const BufferAllocation& dus1_alloc = GetTopLevelAllocation(*buffers, dus1); + EXPECT_EQ(parameter_alloc, dus1_alloc); + const BufferAllocation& dus2_alloc = GetTopLevelAllocation(*buffers, dus2); + EXPECT_EQ(parameter_alloc, dus2_alloc); } } diff --git a/tensorflow/compiler/xla/service/cholesky_expander.cc b/tensorflow/compiler/xla/service/cholesky_expander.cc index 20576cdc52d..ffb0fb4e6ef 100644 --- a/tensorflow/compiler/xla/service/cholesky_expander.cc +++ b/tensorflow/compiler/xla/service/cholesky_expander.cc @@ -35,8 +35,6 @@ limitations under the License. namespace xla { -namespace { - // The Cholesky–Banachiewicz algorithm. See // https://en.wikipedia.org/wiki/Cholesky_decomposition#The_Cholesky–Banachiewicz_and_Cholesky–Crout_algorithms // for a description. @@ -54,78 +52,70 @@ namespace { // l = temp / l[..., j, j) * mask + l // return l // Returns a (result, error) pair. 
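For reference, the Cholesky–Banachiewicz recurrence that the comment above links to can be written as a plain scalar routine. The sketch below is standalone and illustrative only: the name, dense row-major storage, and the boolean error return are assumptions for the example, not the XLA implementation, which works on batched XlaOps with masking.

#include <cmath>
#include <vector>

// Factors a symmetric positive-definite n x n matrix a (row-major) into a
// lower-triangular L with a = L * L^T. Returns false on a non-positive or
// NaN pivot, playing the role of the (result, error) pair above.
bool CholeskyBanachiewicz(const std::vector<double>& a, int n,
                          std::vector<double>* l_out) {
  std::vector<double>& l = *l_out;
  l.assign(n * n, 0.0);
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j <= i; ++j) {
      // sum = A[i][j] - dot(L[i][0..j), L[j][0..j))
      double sum = a[i * n + j];
      for (int k = 0; k < j; ++k) sum -= l[i * n + k] * l[j * n + k];
      if (i == j) {
        if (sum <= 0.0 || std::isnan(sum)) return false;  // "seen_error"
        l[i * n + j] = std::sqrt(sum);
      } else {
        l[i * n + j] = sum / l[j * n + j];
      }
    }
  }
  return true;
}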
-std::pair CholeskyUnblocked( +StatusOr> CholeskyExpander::CholeskyUnblocked( XlaOp a, PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); - auto result = [&]() -> StatusOr> { - TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); - const int n_dims = a_shape.rank(); - const int64 n = ShapeUtil::GetDimension(a_shape, -1); - auto major_dims = AsInt64Slice(a_shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims - 2); + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int n_dims = a_shape.rank(); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + auto major_dims = AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - 2); - auto matrix_dims = AsInt64Slice(a_shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims); + auto matrix_dims = AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims); - XlaOp l = ZerosLike(a); + XlaOp l = ZerosLike(a); - // Construct the for loop body to iterate over rows. - auto body_fn = - [&](XlaOp i, absl::Span loop_vars, - XlaBuilder* body_builder) -> StatusOr> { - std::vector row_shape_dims(major_dims.begin(), major_dims.end()); - std::vector col_shape_dims(major_dims.begin(), major_dims.end()); - auto body_a = loop_vars[0]; - auto body_l = loop_vars[1]; - auto seen_error = loop_vars[2]; - auto iota_row = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), - n_dims - 1); - auto iota_col = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), - n_dims - 2); + // Construct the for loop body to iterate over rows. + auto body_fn = [&](XlaOp i, absl::Span loop_vars, + XlaBuilder* body_builder) -> StatusOr> { + std::vector row_shape_dims(major_dims.begin(), major_dims.end()); + std::vector col_shape_dims(major_dims.begin(), major_dims.end()); + auto body_a = loop_vars[0]; + auto body_l = loop_vars[1]; + auto seen_error = loop_vars[2]; + auto iota_row = + Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), n_dims - 1); + auto iota_col = + Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), n_dims - 2); - auto mask_pred = Ge(iota_col, iota_row); - mask_pred = And(mask_pred, Eq(iota_row, i)); - auto mask_zeros = - Zeros(body_builder, - ShapeUtil::MakeShape(a_shape.element_type(), matrix_dims)); - // L * L.T, This matrix has of a lot of multiplying with zero - // (namely, L[:, j:] = 0) and redundant computation, but it is faster - // than slice. - auto l_square = BatchDot(body_l, false, body_l, true, precision); + auto mask_pred = Ge(iota_col, iota_row); + mask_pred = And(mask_pred, Eq(iota_row, i)); + auto mask_zeros = + Zeros(body_builder, + ShapeUtil::MakeShape(a_shape.element_type(), matrix_dims)); + // L * L.T, This matrix has of a lot of multiplying with zero + // (namely, L[:, j:] = 0) and redundant computation, but it is faster + // than slice. 
+ auto l_square = BatchDot(body_l, false, body_l, true, precision); - // A - L*L.T - l_square = body_a - l_square; - auto l_ii = DynamicSliceInMinorDims(l_square, {i, i}, {1, 1}); - l_ii = Sqrt(l_ii); - // L = (A - L*L.T) / l_ii * mask + L - body_l = Select(mask_pred, l_square / l_ii, mask_zeros) + body_l; + // A - L*L.T + l_square = body_a - l_square; + auto l_ii = DynamicSliceInMinorDims(l_square, {i, i}, {1, 1}); + l_ii = Sqrt(l_ii); + // L = (A - L*L.T) / l_ii * mask + L + body_l = Select(mask_pred, l_square / l_ii, mask_zeros) + body_l; - seen_error = - Or(seen_error, Any(Or(Le(l_ii, ZerosLike(l_ii)), IsNan(l_ii)))); + seen_error = + Or(seen_error, Any(Or(Le(l_ii, ZerosLike(l_ii)), IsNan(l_ii)))); - return std::vector{body_a, body_l, seen_error}; - }; + return std::vector{body_a, body_l, seen_error}; + }; - TF_ASSIGN_OR_RETURN( - auto cholesky_while, - ForEachIndex(n, S32, body_fn, {a, l, ConstantR0(builder, false)}, - "unblocked", builder)); + TF_ASSIGN_OR_RETURN( + auto cholesky_while, + ForEachIndex(n, S32, body_fn, {a, l, ConstantR0(builder, false)}, + "unblocked", builder)); - return std::make_pair(cholesky_while[1], cholesky_while[2]); - }(); - if (!result.ok()) { - XlaOp error = builder->ReportError(result.status()); - return {error, error}; - } - return result.ValueOrDie(); + return std::make_pair(cholesky_while[1], cholesky_while[2]); } -XlaOp BuildCholesky(XlaOp a, int64 block_size, - PrecisionConfig::Precision precision) { +XlaOp CholeskyExpander::BuildCholesky(XlaOp a, int64 block_size, + PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); @@ -162,6 +152,7 @@ XlaOp BuildCholesky(XlaOp a, int64 block_size, XlaOp seen_error = ConstantR0(builder, false); for (int64 i = 0; i < n; i += block_size) { int64 k = std::min(block_size, n - i); + auto panel = SliceInMinorDims(a, {i, i}, {n, i + k}); if (i > 0) { // TODO(phawkins): consider implementing SYRK for the diagonal part of // the panel. @@ -169,28 +160,34 @@ XlaOp BuildCholesky(XlaOp a, int64 block_size, auto lhs = SliceInMinorDims(l, {i, 0}, {n, i}); auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i}); auto delta = BatchDot(lhs, false, rhs, true, precision); - auto before = SliceInMinorDims(a, {i, i}, {n, i + k}); - a = UpdateSliceInMinorDims(a, before - delta, {i, i}); + panel = panel - delta; } // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k]) - auto x = SliceInMinorDims(a, {i, i}, {i + k, i + k}); + auto x = SliceInMinorDims(panel, {0, 0}, {k, k}); XlaOp factorized; + // TODO(b/167896062): A failure in one element of a batch shouldn't fail + // other elements. 
XlaOp factorized_error; - std::tie(factorized, factorized_error) = CholeskyUnblocked(x, precision); + if (k == 1) { + factorized = Sqrt(x); + factorized_error = Any(IsNan(factorized)); + } else { + TF_ASSIGN_OR_RETURN(auto tile_output, CholeskyUnblocked(x, precision)); + std::tie(factorized, factorized_error) = tile_output; + } seen_error = Or(seen_error, factorized_error); l = UpdateSliceInMinorDims(l, factorized, {i, i}); if (i + k < n) { // l[i+k:, i:i+k] = // trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k]) - auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k}); - auto update = - TriangularSolve(factorized, panel, - /*left_side=*/false, - /*lower=*/true, - /*unit_diagonal=*/false, - /*transpose_a=*/TriangularSolveOptions::TRANSPOSE); + auto update = TriangularSolve( + factorized, SliceInMinorDims(panel, {k, 0}, {n - i, k}), + /*left_side=*/false, + /*lower=*/true, + /*unit_diagonal=*/false, + /*transpose_a=*/TriangularSolveOptions::TRANSPOSE); l = UpdateSliceInMinorDims(l, update, {i + k, i}); } } @@ -199,8 +196,6 @@ XlaOp BuildCholesky(XlaOp a, int64 block_size, }); } -} // namespace - bool CholeskyExpander::InstructionMatchesPattern(HloInstruction* instruction) { return instruction->opcode() == HloOpcode::kCholesky; } diff --git a/tensorflow/compiler/xla/service/cholesky_expander.h b/tensorflow/compiler/xla/service/cholesky_expander.h index d2958db1b8c..ee8531d0f48 100644 --- a/tensorflow/compiler/xla/service/cholesky_expander.h +++ b/tensorflow/compiler/xla/service/cholesky_expander.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_CHOLESKY_EXPANDER_H_ #include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { @@ -31,7 +32,13 @@ class CholeskyExpander : public OpExpanderPass { StatusOr ExpandInstruction( HloInstruction* instruction) override; + virtual StatusOr> CholeskyUnblocked( + XlaOp a, PrecisionConfig::Precision precision); + private: + XlaOp BuildCholesky(XlaOp a, int64 block_size, + PrecisionConfig::Precision precision); + // Mapping from op signatures to existing computations. absl::flat_hash_map computation_cache_; }; diff --git a/tensorflow/compiler/xla/service/comparison_expander.cc b/tensorflow/compiler/xla/service/comparison_expander.cc new file mode 100644 index 00000000000..5c88ff8cae2 --- /dev/null +++ b/tensorflow/compiler/xla/service/comparison_expander.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/comparison_expander.h" + +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +HloInstruction* BitcastConvertFloatingPointToIntegral( + HloComputation* computation, HloInstruction* value, + const Shape& signed_shape, const Shape& unsigned_shape, + HloInstruction* zero, HloInstruction* max_value) { + // Switch from a floating point value to an integer value in such a way that + // when using the integer value to compare, we get the same result for normal + // values, -NaN is treated as the smallest value, and NaN is treated as + // the largest value. + // If f is a float, and + // x = bit_cast<int32>(f); + // y = x < 0 ? numeric_limits<int32>::max() - x : x; + // then y is ordered as an int32 such that finite values have the obvious + // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning + // and end of the ordering. + // Note that in order to avoid signed overflow, we calculate + // numeric_limits<int32>::max() - x as unsigned, and then convert back to + // signed. + auto signed_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(signed_shape, value)); + auto unsigned_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(unsigned_shape, value)); + auto flipped_value = computation->AddInstruction(HloInstruction::CreateBinary( + unsigned_shape, HloOpcode::kSubtract, max_value, unsigned_value)); + flipped_value = computation->AddInstruction( + HloInstruction::CreateBitcastConvert(signed_shape, flipped_value)); + auto compare_shape = signed_shape; + compare_shape.set_element_type(PRED); + auto is_negative = computation->AddInstruction(HloInstruction::CreateCompare( + compare_shape, signed_value, zero, ComparisonDirection::kLt)); + return computation->AddInstruction( + HloInstruction::CreateTernary(signed_shape, HloOpcode::kSelect, + is_negative, flipped_value, signed_value)); +} + +bool ComparisonExpander::InstructionMatchesPattern( + HloInstruction* instruction) { + if (HloCompareInstruction* compare = + dynamic_cast<HloCompareInstruction*>(instruction)) { + HloInstruction* lhs = instruction->operands()[0]; + if (compare->type() == Comparison::Type::kFloatTotalOrder && + primitive_util::IsFloatingPointType(lhs->shape().element_type())) { + return true; + } + } + return false; +} + +StatusOr<HloInstruction*> ComparisonExpander::ExpandInstruction( + HloInstruction* instruction) { + CHECK(instruction->opcode() == HloOpcode::kCompare); + HloCompareInstruction* compare = + static_cast<HloCompareInstruction*>(instruction); + CHECK(compare->type() == Comparison::Type::kFloatTotalOrder); + HloComputation* computation = instruction->parent(); + HloInstruction* lhs = instruction->operands()[0]; + HloInstruction* rhs = instruction->operands()[1]; + Shape compare_shape = lhs->shape(); + PrimitiveType compare_type = compare_shape.element_type(); + CHECK(primitive_util::IsFloatingPointType(compare_type)); + // Special-case handling for BF16.
We currently do not support direct + // comparisons with BF16, so we convert to F32 and then use the F32 + // comparison logic. + if (compare_type == BF16) { + compare_type = F32; + compare_shape.set_element_type(compare_type); + lhs = computation->AddInstruction( + HloInstruction::CreateConvert(compare_shape, lhs)); + rhs = computation->AddInstruction( + HloInstruction::CreateConvert(compare_shape, rhs)); + } + + int64 bit_width = primitive_util::BitWidth(compare_type); + PrimitiveType signed_type = + primitive_util::SignedIntegralTypeForBitWidth(bit_width); + PrimitiveType unsigned_type = + primitive_util::UnsignedIntegralTypeForBitWidth(bit_width); + auto signed_shape = compare_shape; + signed_shape.set_element_type(signed_type); + auto unsigned_shape = compare_shape; + unsigned_shape.set_element_type(unsigned_type); + auto zero_value = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(signed_type))); + zero_value = computation->AddInstruction(HloInstruction::CreateBroadcast( + signed_shape, zero_value, zero_value->shape().dimensions())); + auto max_signed = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MaxValue(signed_type))); + auto max_shape = max_signed->shape(); + max_shape.set_element_type(unsigned_type); + auto max_unsigned = computation->AddInstruction( + HloInstruction::CreateConvert(max_shape, max_signed)); + auto max_value = computation->AddInstruction(HloInstruction::CreateBroadcast( + unsigned_shape, max_unsigned, max_shape.dimensions())); + lhs = BitcastConvertFloatingPointToIntegral( + computation, lhs, signed_shape, unsigned_shape, zero_value, max_value); + rhs = BitcastConvertFloatingPointToIntegral( + computation, rhs, signed_shape, unsigned_shape, zero_value, max_value); + auto new_compare = computation->AddInstruction(HloInstruction::CreateCompare( + instruction->shape(), lhs, rhs, compare->direction(), + Comparison::Type::kSigned)); + VLOG(2) << "New comparison instruction for total order:" + << new_compare->ToString() << "\n"; + return new_compare; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/comparison_expander.h b/tensorflow/compiler/xla/service/comparison_expander.h new file mode 100644 index 00000000000..df8b5dc0137 --- /dev/null +++ b/tensorflow/compiler/xla/service/comparison_expander.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/op_expander_pass.h" + +namespace xla { + +// A pass which performs expansion of the comparison operator to support total +// order comparison of floating point numbers. 
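To make the total-order trick concrete, the scalar analogue of the key that the expanded HLO computes elementwise before the signed compare might look like the standalone sketch below. It covers f32 only; TotalOrderKey and TotalOrderLt are illustrative names for this example, not part of the pass.

#include <cstdint>
#include <cstring>
#include <limits>

int32_t TotalOrderKey(float f) {
  int32_t s;
  uint32_t u;
  std::memcpy(&s, &f, sizeof f);  // bitcast-convert to s32
  std::memcpy(&u, &f, sizeof f);  // bitcast-convert to u32
  if (s < 0) {
    // Flip negative values. Compute max() - u in unsigned arithmetic to
    // avoid signed overflow, then reinterpret the result as signed.
    const uint32_t flipped =
        static_cast<uint32_t>(std::numeric_limits<int32_t>::max()) - u;
    std::memcpy(&s, &flipped, sizeof flipped);
  }
  // Comparing keys with ordinary signed < now yields a total order:
  // -NaN < -Inf < ... < -0.0 < +0.0 < ... < +Inf < NaN.
  return s;
}

bool TotalOrderLt(float a, float b) {
  return TotalOrderKey(a) < TotalOrderKey(b);
}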
+class ComparisonExpander : public OpExpanderPass { + public: + explicit ComparisonExpander() = default; + ~ComparisonExpander() override = default; + absl::string_view name() const override { return "comparison-expander"; } + + private: + // Returns `true` if `instruction` should be expanded by this pass. + bool InstructionMatchesPattern(HloInstruction* instruction) override; + // Returns a replacement for `instruction`, or nullptr if no replacement is + // needed (e.g. only the to_apply subcomputation of the instruction was + // modified). + StatusOr ExpandInstruction( + HloInstruction* instruction) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPARISON_EXPANDER_H_ diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index cdda0aeb925..bd72ad22cb2 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -29,11 +29,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -95,12 +97,23 @@ class BoundaryVisitor { absl::flat_hash_set visited_; }; +template +int64 CountNonLeafOps(const OpCollection& ops) { + absl::flat_hash_set op_set; + for (auto op : ops) { + if (!op_set.contains(op) && op->opcode() != HloOpcode::kConstant) { + op_set.insert(op); + } + } + return op_set.size(); +} + // Returns estimation of potential reuses carried by a given pair of // instructions. 
Use different integers to classify different levels // of reuses This is used as a placeholder only, assuming all // instructions can be fused to enable data reuses int64 ReusesCarriedBy(HloInstruction* op, HloInstruction* user) { - VLOG(1) << "ConditionalCodeMotion: Add reuses carried by instr: " + VLOG(2) << "ConditionalCodeMotion: Add reuses carried by instr: " << op->ToString() << "=>" << user->ToString() << "\n"; switch (user->opcode()) { case HloOpcode::kGetTupleElement: @@ -114,9 +127,11 @@ int64 ReusesCarriedBy(HloInstruction* op, HloInstruction* user) { case HloOpcode::kConstant: case HloOpcode::kGetTupleElement: return 0; + case HloOpcode::kConditional: + return 10; default: // Assume fusion will not happen anyway if user count > 1) - if (op->user_count() > 1) { + if (CountNonLeafOps(op->users()) > 1) { return 0; } return 10; @@ -432,7 +447,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( if (to_move_out.empty()) { return false; } - VLOG(1) << "number of boundaries to move out:" << to_move_out.size() << "\n"; + VLOG(1) << "Modifying code--number of boundaries to move out:" + << to_move_out.size() << "\n"; HloComputation* conditional_parent = conditional->parent(); // save the old users before add new conditional user instructions std::vector old_conditional_users = conditional->users(); @@ -441,7 +457,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( absl::flat_hash_map hoisted_instructions; // Insert GetTupleElement before the instructions whose operands might still // be within the conditional. - VLOG(2) << "before opt:" + VLOG(1) << "before opt:" << conditional_parent->ToString(HloPrintOptions::Fingerprint()) << "\n"; int64 op_index = 0; @@ -470,16 +486,22 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( HloInstruction* old_root = conditional->branch_computation(0)->root_instruction(); for (auto user_instr : old_conditional_users) { + VLOG(2) << "Checking conditional user: " << user_instr->ToString() << "\n"; CHECK(user_instr->opcode() == HloOpcode::kGetTupleElement); auto tuple_opd = static_cast(user_instr); int64 index = tuple_opd->tuple_index(); + CHECK(old_root->operands().size() > index); HloInstruction* old_opd = old_root->operands()[index]; + CHECK(ContainsKey(hoisted_instructions, old_opd)); HloInstruction* new_opd = hoisted_instructions[old_opd].operands()[0]; CHECK(old_opd != nullptr); CHECK(new_opd != nullptr); + VLOG(2) << "Try replace all uses of :" << old_opd->ToString() << "\n"; TF_RETURN_IF_ERROR(user_instr->ReplaceAllUsesWith(new_opd)); TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(user_instr)); } + VLOG(2) << "Done changing conditional users\n" + << conditional_parent->ToString() << "\n"; // Create tuple element within each branch and set it as root. int64 branch_count = conditional->branch_count(); for (int i = 0; i < branch_count; i++) { @@ -487,9 +509,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( std::vector elements; for (auto b1 : new_boundaries) { HloInstruction* op = b1.operands()[i]; - VLOG(1) << "branch count=" << i << "\n"; CHECK(op != nullptr); - VLOG(1) << "Adding to root " << i << " with " << op->ToString() << "\n"; + VLOG(2) << "Adding to root " << i << " with " << op->ToString() << "\n"; elements.push_back(op); } HloInstruction* tuple = @@ -498,8 +519,16 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( VLOG(2) << "computation is :" << computation->ToString() << "\n"; // Remove hoisted instructions from the branches. 
for (auto b2 : to_move_out) { - VLOG(2) << "Removing boundary:" << b2.ToString() << "\n"; - TF_RETURN_IF_ERROR(computation->RemoveInstruction(b2.operands()[i])); + auto instr_to_remove = b2.operands()[i]; + // Double check to make sure it is safe to delete the instruction. + // Complications may arise due to some operations in the alternative + // branches (branches 1..n) being placed into the boundaries multiple + // times. + if (!computation->IsMarkedAsDead(instr_to_remove) && + instr_to_remove->user_count() == 0) { + VLOG(2) << "Removing boundary:" << b2.ToString() << "\n"; + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instr_to_remove)); + } } } // Change conditional instruction shape to the shape of the new root. @@ -507,7 +536,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( conditional->branch_computation(0)->root_instruction(); *conditional->mutable_shape() = new_root->shape(); // - VLOG(2) << "done moving instructions out of branches\n" + VLOG(1) << "done moving instructions out of branches\n" << conditional_parent->ToString(HloPrintOptions::Fingerprint()) << "\n"; return true; @@ -520,48 +549,89 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( if (to_move_in.empty()) { return false; } - VLOG(1) << "number of boundaries to move in:" << to_move_in.size() << "\n"; - HloComputation* conditional_parent = conditional->parent(); - VLOG(2) << "before opt:" - << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + VLOG(1) << "Modifying code---number of boundaries to move in:" + << to_move_in.size() << "\n"; + VLOG(1) << "before opt:" + << conditional->parent()->ToString(HloPrintOptions::Fingerprint()) << "\n"; // Mapping instructions to be moved to their new representations. absl::flat_hash_map hoisted_instructions; int64 to_move_in_size = to_move_in.size(); int64 branch_count = conditional->branch_count(); - int64 op_index = conditional->shape().tuple_shapes_size(); - // Map conditional to its old root, then create a new root instruction in each - // branch. - Boundary b(Boundary::Position::kInsideBranch); + // Number of old conditional entries still to be used outside. + // If conditional shape is not tuple, will create a tuple and use subscript + // 0 to save the old operand being used. + int64 op_index = conditional->shape().IsTuple() + ? conditional->shape().tuple_shapes_size() - 1 + : 0; + HloGetTupleElementInstruction* tuple_use = + dynamic_cast(to_move_in[0].operands()[0]); + int64 use_index = (tuple_use != nullptr) ? tuple_use->tuple_index() : -1; + VLOG(2) << "Tuple use index = " << use_index << "\n"; + // Use to map the tuple_use instruction to its operand; + Boundary b_opd_use(Boundary::Position::kInsideBranch); + Boundary b_old_root(Boundary::Position::kInsideBranch); + // Create a new root instruction in each branch. 
for (int i = 0; i < branch_count; i++) { auto computation = conditional->branch_computation(i); auto old_root = computation->root_instruction(); - b.mutable_operands().push_back(old_root); - HloInstruction* new_root = nullptr; + b_old_root.mutable_operands().push_back(old_root); + std::vector operands; if (old_root->opcode() == HloOpcode::kTuple) { - new_root = computation->AddInstruction(old_root->Clone()); - } else { - std::vector operands; - if (!old_root->shape().IsTuple()) { - operands.push_back(old_root); - } else { - const Shape& old_shape = old_root->shape(); - for (int64 i = 0; i < old_shape.tuple_shapes_size(); ++i) { - auto element = - computation->AddInstruction(HloInstruction::CreateGetTupleElement( - old_shape.tuple_shapes(i), old_root, i)); - operands.push_back(element); + // Use operands of old_root directly, so old_root can be removed later. + for (int i = 0; i < old_root->operand_count(); ++i) { + if (i != use_index) { + operands.push_back(old_root->operands()[i]); + } else { // Map conditional use to the tuple operand. + b_opd_use.mutable_operands().push_back(old_root->operands()[i]); } } - new_root = - computation->AddInstruction(HloInstruction::CreateTuple(operands)); + } else if (old_root->shape().IsTuple()) { + // If old_root is not a kTuple but has tuple shape, elements within the + // tuple must be extracted first to be used by the new instructions. + const Shape& old_shape = old_root->shape(); + for (int64 i = 0; i < old_shape.tuple_shapes_size(); ++i) { + auto element = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + old_shape.tuple_shapes(i), old_root, i)); + if (i != use_index) { + operands.push_back(element); + } else { + b_opd_use.mutable_operands().push_back(element); + } + } + } else { + // If old_root is not a tuple and does not have tuple shape, use it + // to replace the conditional directly in the new computation. + b_opd_use.mutable_operands().push_back(conditional); } + + HloInstruction* new_root = + computation->AddInstruction(HloInstruction::CreateTuple(operands)); VLOG(2) << "setting new root: " << new_root->ToString() << "\n"; - computation->set_root_instruction(new_root); + computation->set_root_instruction(new_root, + /*accept_different_shape*/ true); + if (old_root->opcode() == HloOpcode::kTuple) { + TF_RETURN_IF_ERROR(computation->RemoveInstruction(old_root)); + } VLOG(2) << "new branch computation: " << computation->ToString() << "\n"; } - hoisted_instructions[conditional] = b; - for (int64 i = 0; i < to_move_in_size; i++) { + // Update get tuple element index of the conditional. 
+ if (use_index != -1) { + for (auto* user : conditional->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement && + user->tuple_index() > use_index) { + user->set_tuple_index(user->tuple_index() - 1); + } + } + } + hoisted_instructions[conditional] = b_old_root; + int64 cp_start = 0; + if (use_index >= 0) { + hoisted_instructions[tuple_use] = b_opd_use; + cp_start = 1; + } + for (int64 i = cp_start; i < to_move_in_size; i++) { Boundary b_to_move = to_move_in[i]; HloInstruction* op = b_to_move.operands()[0]; CHECK(op != nullptr); @@ -591,12 +661,12 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( } if (to_be_used_outside) { // Modify uses of instructions outside of the conditionals - HloInstruction* gtr = conditional_parent->AddInstruction( + HloInstruction* gtr = conditional->parent()->AddInstruction( HloInstruction::CreateGetTupleElement(op->shape(), conditional, op_index++)); TF_RETURN_IF_ERROR(op->ReplaceAllUsesWith(gtr)); - if (conditional_parent->root_instruction() == op) { - conditional_parent->set_root_instruction(gtr); + if (conditional->parent()->root_instruction() == op) { + conditional->parent()->set_root_instruction(gtr); } } } @@ -606,8 +676,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( HloInstruction* new_root = conditional->branch_computation(0)->root_instruction(); *conditional->mutable_shape() = new_root->shape(); - VLOG(2) << "Before removing instructions:" << conditional_parent->ToString() - << "\n"; + VLOG(2) << "Before removing instructions:" + << conditional->parent()->ToString() << "\n"; // Remove hoisted instructions from the branches. for (int64 i = to_move_in_size - 1; i >= 0; i--) { Boundary boundary_to_move_in = to_move_in[i]; @@ -616,10 +686,20 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( for (auto user : op->users()) { VLOG(2) << "Has User: " << user->ToString() << "\n"; } - TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(op)); + TF_RETURN_IF_ERROR(conditional->parent()->RemoveInstruction(op)); } - VLOG(2) << "Done moving instructions inside branches\n" - << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + + // Reset shapes of user gtes to the new shape. + if (use_index != -1) { + for (auto* user : conditional->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + *user->mutable_shape() = + conditional->shape().tuple_shapes(user->tuple_index()); + } + } + } + VLOG(1) << "Done moving instructions inside branches\n" + << conditional->parent()->ToString(HloPrintOptions::Fingerprint()) << "\n"; return true; } @@ -631,6 +711,7 @@ class GroupConnectedBoundaries { HloInstruction* conditional_; HloComputation* conditional_parent_; bool is_layout_sensitive_; + // Instructions that have been visited but are not going to be moved. absl::flat_hash_set visited_; public: @@ -639,7 +720,7 @@ class GroupConnectedBoundaries { : conditional_(conditional), conditional_parent_(conditional->parent()), is_layout_sensitive_(is_layout_sensitive) {} - // Returns true if `instruction` is worth hoisting out. + // Returns true if `instruction` is worth hoisting. 
bool WorthHoisting(HloInstruction* instruction) { // This is needed for the "moving-in" transformation, to prevent the root // of the parent computation (which contains the conditional) to be moved @@ -663,13 +744,14 @@ class GroupConnectedBoundaries { case HloOpcode::kReshape: return true; default: - VLOG(1) << "Instruction is convert and its operand is not know to " + VLOG(2) << "Instruction is convert and its operand is not know to " "be worth hoisting\n"; return false; } case HloOpcode::kAllReduce: case HloOpcode::kAdd: case HloOpcode::kPower: + case HloOpcode::kCopy: case HloOpcode::kConstant: case HloOpcode::kSubtract: case HloOpcode::kMultiply: @@ -680,24 +762,28 @@ class GroupConnectedBoundaries { case HloOpcode::kGetTupleElement: return true; default: - VLOG(1) << "Instruction is not known to be worth hoisting\n"; + VLOG(2) << "Instruction is not known to be worth hoisting\n"; return false; } } int64 ReusesBeforeBoundary(HloInstruction* user) { int64 reuses = 0; for (auto op : user->operands()) { + // The operand must be an instruction that is not going to be moved (if + // user is inside the conditional); otherwise it must be the conditional + // itself and its user must be outside of the conditional. + if (!ContainsKey(visited_, op) && op != conditional_) { + continue; + } // Only consider single-user cases as reuseable. - if (ContainsKey(visited_, op) && op->user_count() == 1) { + if (user->opcode() == HloOpcode::kGetTupleElement && + user->user_count() == 1) { + reuses += ReusesCarriedBy(op, user->users()[0]); + } else if (op->user_count() == 1) { reuses += ReusesCarriedBy(op, user); - } else if (op->opcode() == HloOpcode::kConditional && - user->opcode() == HloOpcode::kGetTupleElement) { - if (user->user_count() == 1) { - reuses += ReusesCarriedBy(op, user->users()[0]); - } } } - VLOG(1) << "Reuses before instruction " << user->ToString() << ":" << reuses + VLOG(2) << "Reuses before instruction " << user->ToString() << ":" << reuses << "\n"; return reuses; } @@ -735,7 +821,7 @@ class GroupConnectedBoundaries { } else if (ContainsKey(visited_, op)) { reuses += ReusesCarriedBy(user, op); } - VLOG(1) << "reuses after instruction " << user->ToString() << ":" + VLOG(2) << "reuses after instruction " << user->ToString() << ":" << reuses << "\n"; return reuses; } @@ -744,7 +830,8 @@ class GroupConnectedBoundaries { int64 BenefitForMovingBoundaries(const std::vector& boundaries) { int64 reuses_before = 0, reuses_after = 0; - if (boundaries.size() == 1 && boundaries[0].IsOutsideBranch()) { + if (boundaries.size() == 1 && boundaries[0].IsOutsideBranch() && + boundaries[0].operands()[0]->opcode() == HloOpcode::kGetTupleElement) { // The only boundary of moving-in is the get_tuple_element op. 
return -1; } @@ -754,16 +841,16 @@ class GroupConnectedBoundaries { continue; } reuses_before += ReusesBeforeBoundary(op); - VLOG(1) << "Reuses before boundary so far: " << reuses_before << "\n"; + VLOG(2) << "Reuses before boundary so far: " << reuses_before << "\n"; reuses_after += ReusesAfterBoundary(op); - VLOG(1) << "Reuese after boundary so far : " << reuses_after << "\n"; + VLOG(2) << "Reuese after boundary so far : " << reuses_after << "\n"; } if (reuses_after == 0 && reuses_before == 0) { return -1; } else if (boundaries[0].IsInsideBranch()) { return reuses_after - reuses_before; } else { - return reuses_before - reuses_after; + return reuses_before - reuses_after - 1; } } @@ -779,17 +866,6 @@ class GroupConnectedBoundaries { } return b2; } - int64 CountNonLeafOps(const xla::HloInstruction::InstructionVector& ops) { - int64 count = 0; - absl::flat_hash_set op_set; - for (auto op : ops) { - if (!op_set.contains(op) && op->opcode() != HloOpcode::kConstant) { - count++; - op_set.insert(op); - } - } - return count; - } // This function is reused both for moving the boundary outside or into a // conditional. As the result, the readability is somewhat compromised. // It might be nice to refactor this function to factor the outside-inside @@ -800,12 +876,12 @@ class GroupConnectedBoundaries { visitor.AddToWorkList(boundary); while (visitor.HasNextBoundary()) { Boundary b = visitor.PopNextBoundary(); - VLOG(1) << "visiting boundary " << b.ToString() << "\n"; + VLOG(2) << "visiting boundary " << b.ToString() << "\n"; if ((b.IsOutsideBranch() || InstructionWithinBranchIdentical( b.operands(), is_layout_sensitive_)) && WorthHoisting(b.operands()[0])) { connected_boundaries_.push_back(b); - VLOG(1) << "boundary can be moved\n"; + VLOG(2) << "boundary can be moved\n"; int64 operand_count = (b.IsInsideBranch()) ? b.operands()[0]->operand_count() : b.operands()[0]->users().size(); @@ -829,20 +905,21 @@ class GroupConnectedBoundaries { } } } else { - VLOG(1) << "boundary cannot be moved\n"; + VLOG(2) << "boundary cannot be moved\n"; visited_.insert(b.operands()[0]); new_boundaries_.push_back(b); } } } - std::vector BoundariesToMoveInOrOut(const Boundary& b) { + std::vector BoundariesToMoveInOrOut(HloInstruction* conditional, + const Boundary& b) { // At the beginning of optimization, a conditional itself is added to a // worklist. Here the conditional is expanded into two sets of boundaries: // the first set contains the boundary that is inside branches and // contains the root of all branches; the second set of boundaries // contains all the users of the conditional. HloInstruction* inst = b.operands()[0]; - if (inst->opcode() == HloOpcode::kConditional) { + if (inst == conditional) { int branch_count = inst->branch_count(); // Add conditional roots as a new boundary to visit. 
Boundary boundary_in(Boundary::Position::kInsideBranch); @@ -873,10 +950,11 @@ ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion( HloInstruction* conditional, const Boundary& cur_boundary, std::vector& to_move, std::vector& new_boundaries) { GroupConnectedBoundaries connect(conditional, is_layout_sensitive_); - auto move_in_or_out = connect.BoundariesToMoveInOrOut(cur_boundary); + auto move_in_or_out = + connect.BoundariesToMoveInOrOut(conditional, cur_boundary); if (!move_in_or_out.empty()) { auto benefit = connect.BenefitForMovingBoundaries(move_in_or_out); - VLOG(1) << "benefit of moving in or out " + VLOG(2) << "benefit of moving in or out " << cur_boundary.operands()[0]->ToString() << ":" << benefit << "\n"; if (benefit >= 0) { new_boundaries.clear(); @@ -896,19 +974,62 @@ ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion( } StatusOr ConditionalCodeMotion::Run(HloModule* module) { + bool changed = false; + bool cleanup_changed = false; + { + HloPassPipeline subpipeline("before_conditional_code_motion"); + subpipeline.AddPass(/*is_layout_sensitive=*/is_layout_sensitive_); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(auto cleanup_changed_now, subpipeline.Run(module)); + cleanup_changed |= cleanup_changed_now; + } // Gather all the conditional ops in the module ahead of time, to avoid // potential complications of modifying the code that affecting traversal. std::vector conditional_ops; + // Track how many times each branch computation is shared. + absl::flat_hash_map conditional_computations; for (auto* comp : module->MakeComputationPostOrder()) { for (auto* instr : comp->MakeInstructionPostOrder()) { if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); + int branch_count = instr->branch_count(); + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = instr->branch_computation(i); + if (ContainsKey(conditional_computations, branch_i)) { + conditional_computations[branch_i]++; + } else { + conditional_computations[branch_i] = 0; + } + } + if (instr->shape().IsTuple()) { + bool can_change_tuple_shape = true; + for (auto user : instr->users()) { + VLOG(2) << "user is : " << user->ToString() << "\n"; + if (user->opcode() != HloOpcode::kGetTupleElement) { + can_change_tuple_shape = false; + } + } + if (can_change_tuple_shape) { + conditional_ops.push_back(instr); + } + } else { + conditional_ops.push_back(instr); + } } } } - bool changed = false; for (HloInstruction* conditional : conditional_ops) { + int branch_count = conditional->branch_count(); + // check for shared conditional computations + bool conditional_is_shared = false; + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = conditional->branch_computation(i); + if (conditional_computations[branch_i] > 0) { + conditional_is_shared = true; + break; + } + } + // Boundaries to move out or to move into the branches. std::vector to_move_out, to_move_in, new_boundaries; // The conditional is moved into a worklist as the seed (starting point). 
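Illustrative sketch (editorial, not part of the patch): the conditional_computations map built in Run() above flags a branch computation as shared when more than one conditional refers to it, because code motion rewrites branch roots in place and must not mutate a computation another conditional still uses. The helper below, with a hypothetical name (CountBranchSharing), restates that counting rule using only APIs already exercised by this patch (branch_count, branch_computation):

#include <vector>
#include "absl/container/flat_hash_map.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"

// Sketch only: for every branch computation, count how many conditionals
// reference it beyond the first one, so a count > 0 means "shared" and the
// computation must be cloned before its root is rewritten.
absl::flat_hash_map<xla::HloComputation*, int> CountBranchSharing(
    const std::vector<xla::HloInstruction*>& conditionals) {
  absl::flat_hash_map<xla::HloComputation*, int> shared_count;
  for (xla::HloInstruction* conditional : conditionals) {
    for (int i = 0; i < conditional->branch_count(); ++i) {
      xla::HloComputation* branch = conditional->branch_computation(i);
      auto it = shared_count.find(branch);
      if (it == shared_count.end()) {
        shared_count[branch] = 0;  // First reference: not shared yet.
      } else {
        ++it->second;  // Any further reference marks the branch as shared.
      }
    }
  }
  return shared_count;
}

Counting the first reference as zero mirrors the patch, where conditional_computations[branch_i] > 0 is the sharedness test that later triggers cloning.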
@@ -926,6 +1047,33 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { Boundary boundary = visitor.PopNextBoundary(); VLOG(2) << "Analyzing boundary:" << boundary.ToString() << "\n"; d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); + if (d != Decision::kNoChange && conditional_is_shared) { + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = conditional->branch_computation(i); + if (conditional_computations[branch_i] > 0) { + // Cloning is absolutely needed if the computation is shared by + // different branches, but the cloning can be potentially avoided + // if the sharing is only among branches of the same conditional. + // If cloning these branches causes a problem due to space issues, + // a fix can pass a vector of unique branches to the actual + // transformations, as an alternative representation of the + // conditional branches to be modified. Right now we assume the + // overhead of cloning is minimal since later stages of the compiler + // inline all the computations anyway. + HloComputation* clone_i = + conditional->parent()->parent()->AddEmbeddedComputation( + branch_i->Clone()); + conditional->set_branch_computation(i, clone_i); + conditional_computations[branch_i]--; + } + } + to_move.clear(); + next_boundary.clear(); + VLOG(2) << "Cloned branches as needed: " << conditional->ToString() + << "\n"; + // Need to reanalyze the cloned code to generate a correct result. + d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); + } switch (d) { case Decision::kMoveOutOfBranch: VLOG(2) << "Decision is move out of branch\n"; @@ -961,22 +1109,14 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { MoveInstructionIn(conditional, to_move_in, new_boundaries)); VLOG(2) << "moving in result:" << result << "\n"; changed |= result; - } - } - // handling convert rematerialization/hoisting - if (!changed && pursue_full_conditional_code_motion_) { - std::vector conditional_ops; - for (auto* comp : module->MakeComputationPostOrder()) { - for (auto* instr : comp->MakeInstructionPostOrder()) { - if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); - } - } - } - for (HloInstruction* conditional_op : conditional_ops) { + } else if (pursue_full_conditional_code_motion_ && !conditional_is_shared) { + // Invoke special handling for convert rematerialization/hoisting + // We need to make sure no sharing is present in the branches because no + // cloning has been done by the earlier analysis. + // TODO[b/165848866]: extend solution to handle cloning for special move.
TF_ASSIGN_OR_RETURN( bool convert_result, - ConvertSpecialMove(conditional_op, is_layout_sensitive_)); + ConvertSpecialMove(conditional, is_layout_sensitive_)); changed |= convert_result; } } @@ -986,8 +1126,11 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { subpipeline.AddPass(); subpipeline.AddPass(); subpipeline.AddPass(); - TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); - changed |= cleanup_changed; + TF_ASSIGN_OR_RETURN(auto cleanup_changed_now, subpipeline.Run(module)); + cleanup_changed |= cleanup_changed_now; + } + if (cleanup_changed) { + VLOG(2) << "subpipeline cleanup have modified code\n"; } return changed; } diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index b0a6ba92f48..3b772221446 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -158,6 +158,44 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); } +TEST_F(ConditionalCodeMotionTest, ConditionalShapeNotMutable) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + ROOT result = (bf16[2,512,364]{2,1,0}, (bf16[2,512,364]{2,1,0})) tuple(get-first-index, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( @@ -580,6 +618,347 @@ ENTRY main { HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); } + +TEST_F(ConditionalCodeMotionTest, NoMoveInWithMultipleGTE) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), 
index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +on_false { + arg_tuple.2 = (f32[10]) parameter(0) + get-tuple-element.2 = f32[10] get-tuple-element(arg_tuple.2), index=0 + mul.1 = f32[10] multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple.4 = (f32[10]) tuple(mul.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false + get-first-index = f32[10] get-tuple-element(conditional), index=0 + get-first-index.2 = f32[10] get-tuple-element(conditional), index=0 + pow.1 = f32[10] power(get-first-index, get-first-index) + ROOT tuple.3 = (f32[10], f32[10]) tuple(pow.1, get-first-index.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Tuple(op::Power(), op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithSharedBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + get-first-index = f32[10] get-tuple-element(conditional), index=0 + ROOT pow.1 = f32[10] power(get-first-index, get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithNonTupleRoot) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + ROOT add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = f32[10] + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + ROOT pow.1 = f32[10] power(conditional, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + 
ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithEmptyBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch1 { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +branch2 { + ROOT arg_tuple.1 = (f32[10]) parameter(0) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=branch1, + false_computation=branch2 + get-first-index = f32[10] get-tuple-element(conditional), index=0 + ROOT pow.1 = f32[10] power(get-first-index, get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 4); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithNonTupleParameter) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg.1 = f32[10] parameter(0) + ROOT add.1 = f32[10] add(arg.1, arg.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = f32[10] parameter(1) + tuple.2 = f32[10] parameter(2) + conditional = f32[10] + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + ROOT pow.1 = f32[10] power(conditional, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 4); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 4); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveCopyInBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch1 { + arg_tuple.1 = (s32[], f32[10,3]{0,1}) parameter(0) + constant.1 = s32[] constant(4) + get-tuple-element.1 = s32[] get-tuple-element(arg_tuple.1), index=0 + add.1 = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = f32[10,3]{0,1} get-tuple-element(arg_tuple.1), index=1 + slice.1 = f32[4,3]{0,1} slice(get-tuple-element.2), + slice={[0:4:1], [0:3:1]} + constant.2 = f32[] constant(0.0) + ROOT tuple.1 = (f32[4,3]{0,1}, s32[],f32[]) tuple(slice.1, add.1, constant.2) +} + +branch2 { + 
arg_tuple.2 = (s32[], f32[4,3]{1,0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(arg_tuple.2), index=0 + copy.1 = s32[] copy(get-tuple-element.3) + get-tuple-element.4 = f32[4,3]{1,0} get-tuple-element(arg_tuple.2), index=1 + copy.2 = f32[4,3]{0,1} copy(get-tuple-element.4) + constant.2 = f32[] constant(0.0) + ROOT tuple.2 = (f32[4,3]{0,1}, s32[], f32[]) tuple(copy.2, copy.1, constant.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.3 = (s32[], f32[10,3]{0,1}) parameter(1) + tuple.4 = (s32[], f32[4,3]{1,0}) parameter(2) + conditional = (f32[4,3]{0,1}, s32[], f32[]) + conditional(pred.1, tuple.3, tuple.4), true_computation=branch1, + false_computation=branch2 + get-zero-index = f32[4,3]{0,1} get-tuple-element(conditional), index=0 + get-first-index = s32[] get-tuple-element(conditional), index=1 + get-second-index = f32[] get-tuple-element(conditional), index=2 + copy.3 = f32[4,3]{1,0} copy(get-zero-index) + ROOT tuple.5 = (f32[4,3]{0,1}, s32[], f32[]) tuple(copy.3, get-first-index, + get-second-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + VLOG(1) << module->ToString(); + + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 9); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 8); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Tuple(op::GetTupleElement(op::Conditional(), 2), + op::GetTupleElement(op::Conditional(), 0), + op::GetTupleElement(op::Conditional(), 1)))); +} + +TEST_F(ConditionalCodeMotionTest, MoveReplicatedTupleEntryOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +%add.64 (x.139: bf16[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%add.181 (x.256: bf16[], y.256: bf16[]) -> bf16[] { + %x.256 = bf16[]{:T(512)} parameter(0) + %y.256 = bf16[]{:T(512)} parameter(1) + ROOT %add.44842 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.256, bf16[]{:T(512)} %y.256) +} + +on_true { + arg_tuple.1 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(0) + get-tuple-element.11 = bf16[2,54,168,128] get-tuple-element(arg_tuple.1), index=0 + get-tuple-element.12 = bf16[2,52,168,128] get-tuple-element(arg_tuple.1), index=1 + convolution.1 = bf16[3,3,128,128] convolution(bf16[2,54,168,128] + get-tuple-element.11, bf16[2,52,168,128] + get-tuple-element.12), window={size=52x168 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.1 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64 + convert.1 = f32[3,3,128,128] convert(bf16[3,3,128,128] %all-reduce.1) + all-reduce.3 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64 + convert.3 = f32[3,3,128,128] convert(bf16[3,3,128,128] %all-reduce.3) + ROOT tuple.1 = (f32[3,3,128,128], f32[3,3,128,128]) tuple(convert.1, convert.3) +} + +on_false { + arg_tuple.2 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(0) + get-tuple-element.21 = 
bf16[2,86,104,128] + get-tuple-element(arg_tuple.2), index=0 + get-tuple-element.22 = bf16[2,84,104,128] + get-tuple-element(arg_tuple.2), index=1 + convolution.2 = bf16[3,3,128,128] + convolution(bf16[2,86,104,128] get-tuple-element.21, bf16[2,84,104,128] + get-tuple-element.22), window={size=84x104 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.2 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.2), + channel_id=485, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.181 + convert.2 = f32[3,3,128,128] + convert(bf16[3,3,128,128] %all-reduce.2) + ROOT tuple.2 = (f32[3,3,128,128], f32[3,3,128,128]) tuple(convert.2, convert.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.3 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(1) + arg_tuple.4 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(2) + conditional = (f32[3,3,128,128], f32[3,3,128,128]) + conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=on_true, + false_computation=on_false + get-first-index = f32[3,3,128,128] + get-tuple-element(conditional), index=0 + add.1 = f32[3,3,128,128] add(f32[3,3,128,128] get-first-index, f32[3,3,128,128] get-first-index) + ROOT result = (f32[3,3,128,128]) tuple(add.1) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + // Checks if conditional shape has changed. 
+ ASSERT_TRUE(ShapeUtil::Compatible( + conditional->shape(), ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape( + BF16, {3, 3, 128, 128})}))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Tuple(op::Add( + op::Convert(op::AllReduce(op::GetTupleElement(op::Conditional()))), + op::Convert( + op::AllReduce(op::GetTupleElement(op::Conditional()))))))); +} + } // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc index 323bf44dcd3..f5506b894fd 100644 --- a/tensorflow/compiler/xla/service/convolution_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc @@ -300,7 +300,8 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) { window_dim->set_window_dilation(1); HloInstruction* new_convolution = MakeConvolveHlo(activation, filter, convolution->feature_group_count(), - window, dim_numbers, convolution->precision_config()) + /*batch_group_count=*/1, window, dim_numbers, + convolution->precision_config()) .ValueOrDie(); convolution->SetupDerivedInstruction(new_convolution); TF_CHECK_OK(computation_->ReplaceInstruction( @@ -649,7 +650,8 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { window_dim->set_window_reversal(false); window_dim->set_window_dilation(1); HloInstruction* new_convolution = - MakeConvolveHlo(activation, filter, 1, window, dim_numbers, + MakeConvolveHlo(activation, filter, /*feature_group_count=*/1, + /*batch_group_count=*/1, window, dim_numbers, convolution->precision_config()) .ValueOrDie(); convolution->SetupDerivedInstruction(new_convolution); diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index b88120d8128..f2e37ca23b6 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -362,6 +362,19 @@ Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis, return Status::OK(); } +// Add copies for the operands of in-place operations. RemoveUnnecessaryCopies +// will remove the unnecessary copies. +Status AddCopiesForInPlaceOperation(const HloAliasAnalysis& alias_analysis, + HloInstruction* in_place_op, + int64 operand_number) { + VLOG(2) << "Adding copies for in-place operation " << in_place_op->name(); + HloInstruction* operand = in_place_op->mutable_operand(operand_number); + TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy, + in_place_op->parent()->DeepCopyInstruction(operand)); + TF_RETURN_IF_ERROR(operand->ReplaceUseWith(in_place_op, deep_copy)); + return Status::OK(); +} + // Conservatively adds copies before root instruction of entry computation and // each aliased parameter to resolve interference of aliased input and output // buffer. We later rely on RemoveUnnecessaryCopies to drop the unnecessary @@ -509,6 +522,12 @@ class CopyRemover { // value. The map is used to construct the copy info map below. absl::flat_hash_map value_to_node; for (const HloBuffer& buffer : alias_analysis.buffers()) { + // No copies should have been inserted within fused computations, so no + // need to remove them. HloOrdering isn't compatible with HloValues inside + // fusions, so skip copy removal for them. + if (buffer.values().at(0)->defining_instruction()->IsFused()) { + continue; + } // Verify values contained in the buffer are strictly ordered. 
This // should always be the case after adding copies to eliminate // interference. Specifically, the addition of the control flow edges @@ -591,7 +610,7 @@ class CopyRemover { void CreateCopyMap( const HloModule& module, const absl::flat_hash_map& value_to_node) { - for (HloComputation* computation : module.computations()) { + for (HloComputation* computation : module.MakeNonfusionComputations()) { for (HloInstruction* instruction : computation->instructions()) { // Add copies with unambiguous source values to the map. Copies with // ambiguous sources are not removable. @@ -1005,7 +1024,7 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, can_share_buffer_)); - for (HloComputation* computation : module->MakeComputationPostOrder()) { + for (HloComputation* computation : module->MakeNonfusionComputations()) { for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { if (instruction->opcode() == HloOpcode::kWhile) { @@ -1013,6 +1032,15 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { } else if (instruction->opcode() == HloOpcode::kConditional) { TF_RETURN_IF_ERROR( AddCopiesForConditional(*alias_analysis, instruction)); + } else { + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instruction)) { + const HloUse& operand = operand_and_output_index.first; + CHECK_EQ(operand.operand_index, ShapeIndex{}) + << "Support for non-{} shape operand not currently implemented."; + TF_RETURN_IF_ERROR(AddCopiesForInPlaceOperation( + *alias_analysis, instruction, operand.operand_number)); + } } } } diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 3ee6b200da5..78730cbdcb8 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -2530,5 +2530,250 @@ ENTRY Entry { EXPECT_EQ(CountCopies(*module), 1); } +TEST_F(CopyInsertionTest, DynamicUpdateSliceNoCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(negate, broadcast.6, constant.3, constant.3, constant.3) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 0); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceNoCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + ROOT fusion = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 0); +} + +TEST_F(CopyInsertionTest, 
DynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add = f32[1280,1,128] add(negate, negate) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(negate, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128]) tuple(add, dynamic-update-slice.5) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, DynamicUpdateSliceParameterShareCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param, broadcast.6, constant.3, constant.3, constant.3) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + add = f32[1280,1,128] add(negate, negate) + fusion = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation + ROOT tuple = (f32[1280,1,128], f32[1280,1,128]) tuple(negate, fusion) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, ChainDynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128]{2,1,0} broadcast(constant.1), dimensions={} + get-tuple-element.4 = f32[1280,1,128]{2,1,0} get-tuple-element(state), index=1 + get-tuple-element.3 = s32[] get-tuple-element(state), index=0 + constant.2 = s32[] constant(128) + add.5 = s32[] add(get-tuple-element.3, constant.2) + constant.3 = s32[] constant(0) + dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceCopy2) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation.1 { + param0 = f32[1280,1,128] 
parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +fused_computation.2 { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + slice = f32[128,1,128] slice(param1), slice={[0:128], [0:1], [0:128]} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, slice, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + add = f32[1280,1,128] add(negate, negate) + fusion1 = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation.1 + ROOT fusion2 = f32[1280,1,128] fusion(fusion1, negate), kind=kLoop, calls=fused_computation.2 +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, MultiOutputFusedDynamicUpdateSliceCopy) { + // Tests multi-output fusion with two DUS outputs, requiring two copies. + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation + gte0 = f32[1280,1,128] get-tuple-element(fusion), index=0 + gte1 = f32[1280,1,128] get-tuple-element(fusion), index=1 + gte2 = f32[1280,1,128] get-tuple-element(fusion), index=2 + add0 = f32[1280,1,128] add(negate0, gte0) + add1 = f32[1280,1,128] add(negate1, gte1) + add2 = f32[1280,1,128] add(negate2, gte2) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add0, add1, add2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 2); +} + +TEST_F(CopyInsertionTest, MultiOutputFusedDynamicUpdateSliceNoCopy) { + // Same as above, but negate1 is not used beyond fusion, so it only needs one + // copy for negate0. 
+ absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation + gte0 = f32[1280,1,128] get-tuple-element(fusion), index=0 + gte1 = f32[1280,1,128] get-tuple-element(fusion), index=1 + gte2 = f32[1280,1,128] get-tuple-element(fusion), index=2 + add0 = f32[1280,1,128] add(negate0, gte0) + add1 = f32[1280,1,128] add(gte1, gte1) + add2 = f32[1280,1,128] add(negate2, gte2) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add0, add1, add2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 6eaf43902fe..4e25d667d03 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -130,21 +130,24 @@ cc_library( ":target_machine_features", "@com_google_absl//absl/base", "@com_google_absl//absl/types:span", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", - "@llvm-project//mlir:ExecutionEngineUtils", "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:VectorOps", "//tensorflow/compiler/xla/service:copy_insertion", - "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:topk_rewriter", "//tensorflow/compiler/xla/service:map_inliner", "//tensorflow/compiler/xla/service:rng_bit_generator_expander", "//tensorflow/compiler/xla/service:tree_reduction_rewriter", - "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:conditional_canonicalizer", "//tensorflow/compiler/xla/service:conditional_to_select", "//tensorflow/compiler/xla/service:slow_operation_alarm", "//tensorflow/compiler/xla/service:scatter_expander", + "//tensorflow/compiler/xla/service:comparison_expander", "//tensorflow/compiler/xla/service:slice_sinker", "//tensorflow/compiler/xla:cpu_function_runtime", "//tensorflow/compiler/xla:literal", @@ -183,6 +186,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:indexed_array_analysis", "//tensorflow/compiler/xla/service:llvm_compiler", + "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:reshape_mover", 
"//tensorflow/compiler/xla/service:rng_expander", "//tensorflow/compiler/xla/service:sort_simplifier", @@ -197,7 +201,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "@llvm-project//llvm:Core", - "@llvm-project//llvm:MC", "@llvm-project//llvm:Object", "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 0826d7b8ce1..e6c72e60636 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -42,7 +42,12 @@ limitations under the License. #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Dialect/Vector/VectorOps.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/literal.h" @@ -54,6 +59,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/cholesky_expander.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" #include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include "tensorflow/compiler/xla/service/conditional_to_select.h" @@ -77,13 +83,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" #include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -120,6 +126,21 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/dynamic_annotations.h" +namespace { + +// We need to explicitly load all the dialects we will involved in emitting the +// IR. This is only needed because of how MLIR is bolted into XLA and does not +// make use of the MLIR infrastructure (like using a proper pass pipeline). +// Hopefully this will all go away at some point in favor of a better +// integration. 
+void LoadMLIRDialects(mlir::MLIRContext& context) { + context.loadDialect(); +} + +} // namespace + namespace xla { namespace cpu { using BufferInfo = cpu_function_runtime::BufferInfo; @@ -163,8 +184,6 @@ CpuCompiler::CpuCompiler() { // Initialize LLVM's MC layer for the native target. llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - - mlir::registerAllDialects(); } namespace { @@ -260,6 +279,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); @@ -288,8 +308,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*expansion_type=*/LogisticExpansionType::kExp); pipeline.AddPass(); pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); + pipeline.AddPass(ScatterExpander::kEliminateAllScatters); pipeline.AddPass(target_machine_features); { auto& pass = @@ -303,6 +322,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pass.AddPass(options); pass.AddPass(); pass.AddPass(); + pass.AddPass(GatherExpander::kEliminateSimpleGathers); // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. @@ -620,10 +640,10 @@ StatusOr> CpuCompiler::RunBackend( // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; - auto llvm_module = absl::make_unique( - "__compute_module", - mlir_context.getRegisteredDialect() - ->getLLVMContext()); + LoadMLIRDialects(mlir_context); + llvm::LLVMContext llvm_context; + auto llvm_module = + absl::make_unique("__compute_module", llvm_context); auto jit = absl::make_unique( CompilerTargetOptions(module->config()), @@ -832,10 +852,9 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, // Compile must be thread-safe so create a new LLVM context for the module. 
mlir::MLIRContext mlir_context; - llvm::Module llvm_module( - "__compute_module", - mlir_context.getRegisteredDialect() - ->getLLVMContext()); + LoadMLIRDialects(mlir_context); + llvm::LLVMContext llvm_context; + llvm::Module llvm_module("__compute_module", llvm_context); llvm_module.setDataLayout(target_machine->createDataLayout()); llvm_module.setTargetTriple(triple.getTriple()); if (pic_level != llvm::PICLevel::NotPIC) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 0abcc91a1d7..7431e829b8e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -247,6 +247,12 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( ExecutionInput& input = arguments[alias->parameter_number]; MaybeOwningDeviceMemory* maybe_owning_memory = input.MutableBuffer(alias->parameter_index); + if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) { + return InvalidArgument( + "An input was configured to be must-alias at " + "compile time but not donated at runtime: %s", + alias->ToString()); + } if (absl::optional owning = maybe_owning_memory->Release()) { // If the caller passes the ownership of the device memory, reuse it diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 9460cc55e10..42c6c9839bf 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -95,7 +95,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && - consumer->ReusesOperandElements(operand_index)) { + ReusesOperandElements(consumer, operand_index)) { VLOG(2) << "Fusion is not profitable."; return false; } @@ -132,7 +132,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - if (fusion_node_evaluations_.at(consumer).AverageCodeDuplicationTooHigh( + if (fusion_node_evaluations_.at(consumer).CodeDuplicationTooHigh( producer)) { return false; } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 72f4d5369c8..36566d6c25f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1640,7 +1640,7 @@ IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType( if (current_size_fragment >= vector_register_size_in_elements) { auto vector_type = llvm::VectorType::get( - element_ir_type, vector_register_size_in_elements); + element_ir_type, vector_register_size_in_elements, false); sharded_vector_type.insert( sharded_vector_type.end(), current_size_fragment / vector_register_size_in_elements, @@ -1656,7 +1656,7 @@ IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType( // of two are all legal vector sizes (or at least can be lowered easily by // LLVM). 
sharded_vector_type.push_back( - llvm::VectorType::get(element_ir_type, current_size_fragment)); + llvm::VectorType::get(element_ir_type, current_size_fragment, false)); } return sharded_vector_type; } @@ -2412,11 +2412,14 @@ Status IrEmitter::HandleTopK(HloInstruction* hlo) { EmitBufferPointer(out_values_slice, hlo->shape().tuple_shapes(0)); llvm::Value* out_indices_ptr = EmitBufferPointer(out_indices_slice, hlo->shape().tuple_shapes(1)); - EmitCallToFunc(runtime::kTopKF32SymbolName, - {b_.getInt64(has_batch ? input->shape().dimensions(0) : 1), - b_.getInt64(input->shape().dimensions().back()), - b_.getInt64(k), values_ptr, out_values_ptr, out_indices_ptr}, - b_.getVoidTy()); + EmitCallToFunc( + runtime::kTopKF32SymbolName, + {b_.getInt64(has_batch ? input->shape().dimensions(0) : 1), + b_.getInt64(input->shape().dimensions().back()), b_.getInt64(k), + BitCast(values_ptr, b_.getFloatTy()->getPointerTo()), + BitCast(out_values_ptr, b_.getFloatTy()->getPointerTo()), + BitCast(out_indices_ptr, b_.getInt32Ty()->getPointerTo())}, + b_.getVoidTy()); llvm_ir::EmitTuple(GetIrArrayFor(hlo), {out_values_ptr, out_indices_ptr}, &b_); diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 8d9229c1223..3afdd9c163e 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -115,7 +115,7 @@ void RewriteCalls( // Upcast to vector type if input is a scalar. if (vector_width == 1) { - llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1); + llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1, false); input = b.CreateInsertElement(llvm::UndefValue::get(v1_type), input, uint64_t{0}); } @@ -264,8 +264,8 @@ llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input, z = vsl.Add(one, z); // Convert n' to an i32. This is safe because we clamped it above. - llvm::Value* n_i32 = - b->CreateFPToSI(n, llvm::VectorType::get(b->getInt32Ty(), vector_width)); + llvm::Value* n_i32 = b->CreateFPToSI( + n, llvm::VectorType::get(b->getInt32Ty(), vector_width, false)); auto splat_i32 = [&](int32 v) { return b->CreateVectorSplat(vector_width, b->getInt32(v)); @@ -329,7 +329,7 @@ llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input, llvm::Value* vector_constant_23 = b->CreateVectorSplat(vector_width, b->getInt32(23)); llvm::Type* i32_vector_type = - llvm::VectorType::get(b->getInt32Ty(), vector_width); + llvm::VectorType::get(b->getInt32Ty(), vector_width, false); llvm::Value* emm0 = b->CreateLShr(b->CreateBitCast(tmp0, i32_vector_type), vector_constant_23); diff --git a/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc index ff48f554ce6..ae23f224207 100644 --- a/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/mlir_emitter.cc @@ -32,7 +32,8 @@ namespace cpu { namespace { // Lower an MLIR module to an LLVM module. -std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module) { +std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module, + llvm::LLVMContext *context) { // When set, the LLVM backend will be allowed to reassociate floating-point // reductions, which enables much more efficient "horizontal" SIMD // implementations. 
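Illustrative sketch (editorial, not part of the patch): the comment above, and the kReassociateFPReductions option used in the hunk that follows, exist because floating-point addition is not associative, so the LLVM backend must be told explicitly that reordering a reduction (for example into a vectorized "horizontal" sum) is acceptable. A tiny standalone illustration of the non-associativity:

#include <cstdio>

int main() {
  const float a = 1e8f, b = -1e8f, c = 1.0f;
  // Left-to-right evaluation keeps the 1.0f; reassociating loses it, because
  // 1.0f is below the representable spacing of float values near 1e8.
  std::printf("(a + b) + c = %g\n", (a + b) + c);  // prints 1
  std::printf("a + (b + c) = %g\n", a + (b + c));  // prints 0
  return 0;
}

Allowing reassociation trades exactly this kind of bit-for-bit reproducibility for faster SIMD reductions.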
@@ -47,7 +48,7 @@ std::unique_ptr MakeLLVMModule(mlir::OwningModuleRef module) { mlir::LowerVectorToLLVMOptions().setReassociateFPReductions( kReassociateFPReductions))); CHECK(succeeded(manager.run(*module))); - return mlir::translateModuleToLLVMIR(*module); + return mlir::translateModuleToLLVMIR(*module, *context); } // Get arguments to pass a memref to an mlir function. @@ -114,7 +115,8 @@ Status EmitMlirFuncAndCall( emitter(&op_builder, function); // Now link it all into the main LLVM module. - auto mlir_llvm_module = MakeLLVMModule(std::move(mlir_module)); + auto mlir_llvm_module = + MakeLLVMModule(std::move(mlir_module), &b->getContext()); mlir_llvm_module->setDataLayout(llvm_module->getDataLayout()); llvm::Linker::linkModules( *llvm_module, std::move(mlir_llvm_module), llvm::Linker::None, diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 225102e6ae6..48f2248d2d7 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -143,7 +143,8 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // TODO(b/27458679) Parallelize instructions which are skipped here. auto opcode = instruction->opcode(); if (llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || - instruction->shape().IsTuple() || opcode == HloOpcode::kRng) { + instruction->shape().IsTuple() || opcode == HloOpcode::kRng || + opcode == HloOpcode::kConstant) { return 1; } diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index e22210a61f2..5b454379876 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -191,5 +191,19 @@ TEST_F(ParallelTaskAssignmentTest, AllReduceNotParallelized) { EXPECT_FALSE(changed); } +TEST_F(ParallelTaskAssignmentTest, ConstantNotParallelized) { + constexpr char hlo_string[] = R"( + HloModule TestTaskParallel_constant + ENTRY const { + ROOT constant = f32[1234567] constant({...}) + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index 0d2eab9fd42..48aa32f6b8f 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -33,7 +33,7 @@ VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type, scalar_type_ = llvm_ir::PrimitiveTypeToIrType( primitive_type, b_->GetInsertBlock()->getModule()); scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_); - vector_type_ = llvm::VectorType::get(scalar_type_, vector_size); + vector_type_ = llvm::VectorType::get(scalar_type_, vector_size, false); vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_); } @@ -155,7 +155,7 @@ llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) { int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type()); llvm::Type* scalar_int_type = b()->getIntNTy(float_size_bits); if (vector) { - return llvm::VectorType::get(scalar_int_type, vector_size()); + return 
llvm::VectorType::get(scalar_int_type, vector_size(), false); } else { return scalar_int_type; } diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index f1a0b0a4406..cbed232897f 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -276,7 +276,7 @@ class VectorSupportLibrary { llvm::Constant* scalar_value = llvm::ConstantFP::get(type->getContext(), f); if (llvm::isa(type)) { return llvm::ConstantVector::getSplat( - llvm::ElementCount(vector_size(), /*Scalable=*/false), scalar_value); + llvm::ElementCount::getFixed(vector_size()), scalar_value); } return scalar_value; } diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index b0def1a2dd8..60d832a940a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -245,6 +245,7 @@ class DfsHloVisitorBase { virtual Status HandleBitcast(HloInstructionPtr hlo) = 0; virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0; virtual Status HandleReshape(HloInstructionPtr hlo) = 0; + virtual Status HandleDynamicReshape(HloInstructionPtr hlo) = 0; virtual Status HandleTranspose(HloInstructionPtr hlo) = 0; virtual Status HandleParameter(HloInstructionPtr hlo) = 0; virtual Status HandleFusion(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index b1d674fe467..3d1a9a3c894 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -198,6 +198,9 @@ class DfsHloVisitorWithDefaultBase Status HandlePad(HloInstructionPtr pad) override { return DefaultAction(pad); } + Status HandleDynamicReshape(HloInstructionPtr dynamic_reshape) override { + return DefaultAction(dynamic_reshape); + } Status HandleReshape(HloInstructionPtr reshape) override { return DefaultAction(reshape); } diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc index 4670ce6940a..3adde5f7d48 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc @@ -49,14 +49,11 @@ bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size) { return false; } -/* static */ absl::optional -ParseDotGeneralFromConvolution(const HloInstruction* conv) { +/* static */ DotConvolutionDimsInfo ParseConvolutionDimsInfo( + const HloInstruction* conv) { CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); - if (conv->feature_group_count() != 1 || conv->batch_group_count() != 1) { - return absl::nullopt; - } const auto& conv_dims = conv->convolution_dimension_numbers(); - DotGeneralAsConvolutionDimsInfo dims; + DotConvolutionDimsInfo dims; dims.lhs_non_contracting_dims.push_back( {conv_dims.input_batch_dimension(), -1, conv_dims.output_batch_dimension(), -1}); @@ -98,10 +95,10 @@ ParseDotGeneralFromConvolution(const HloInstruction* conv) { // padding N - 1, high padding N - 1 and window reversal. 
dims.rhs_non_contracting_dims.push_back({lhs, rhs, output, i}); } else { - return absl::nullopt; + dims.conv_spatial_dims.push_back({lhs, rhs, output, i}); } } else { - return absl::nullopt; + dims.conv_spatial_dims.push_back({lhs, rhs, output, i}); } } @@ -110,8 +107,7 @@ ParseDotGeneralFromConvolution(const HloInstruction* conv) { StatusOr> CreateShardedConvForDotGeneralConvolution( - const HloInstruction& conv, - const DotGeneralAsConvolutionDimsInfo& dot_dnums, + const HloInstruction& conv, const DotConvolutionDimsInfo& dot_dnums, HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo) { CHECK_EQ(conv.opcode(), HloOpcode::kConvolution); const auto& conv_dnums = conv.convolution_dimension_numbers(); @@ -141,16 +137,66 @@ CreateShardedConvForDotGeneralConvolution( wd->set_padding_high(wd->size() - 1); wd->set_padding_low(wd->size() - 1); } - TF_ASSIGN_OR_RETURN(Shape sharded_conv_shape, - ShapeInference::InferConvolveShape( - sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), - /*feature_group_count=*/1, - /*batch_group_count=*/1, window, conv_dnums)); + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), + /*feature_group_count=*/conv.feature_group_count(), + /*batch_group_count=*/conv.batch_group_count(), window, conv_dnums)); *sharded_conv_shape.mutable_layout() = conv.shape().layout(); return HloInstruction::CreateConvolve( sharded_conv_shape, sharded_lhs_hlo, sharded_rhs_hlo, - /*feature_group_count=*/1, - /*batch_group_count=*/1, window, conv_dnums, conv.precision_config()); + /*feature_group_count=*/conv.feature_group_count(), + /*batch_group_count=*/conv.batch_group_count(), window, conv_dnums, + conv.precision_config()); +} + +DotConvolutionDimsInfo ParseDotGeneralFromDot(const HloInstruction* dot) { + const auto& dot_dim_numbs = dot->dot_dimension_numbers(); + dot_as_convolution_util::DotConvolutionDimsInfo dnums; + for (int64 i = 0; i < dot_dim_numbs.lhs_batch_dimensions().size(); ++i) { + dnums.batch_dims.emplace_back(); + dnums.batch_dims.back().lhs = dot_dim_numbs.lhs_batch_dimensions(i); + dnums.batch_dims.back().rhs = dot_dim_numbs.rhs_batch_dimensions(i); + dnums.batch_dims.back().output = i; + dnums.batch_dims.back().spatial_dim = -1; + } + for (int64 i = 0; i < dot_dim_numbs.lhs_contracting_dimensions().size(); + ++i) { + dnums.contracting_dims.emplace_back(); + dnums.contracting_dims.back().lhs = + dot_dim_numbs.lhs_contracting_dimensions(i); + dnums.contracting_dims.back().rhs = + dot_dim_numbs.rhs_contracting_dimensions(i); + dnums.contracting_dims.back().output = -1; + dnums.contracting_dims.back().spatial_dim = -1; + } + for (int64 i = 0; i < dot->operand(0)->shape().rank(); ++i) { + if (!absl::c_linear_search(dot_dim_numbs.lhs_batch_dimensions(), i) && + !absl::c_linear_search(dot_dim_numbs.lhs_contracting_dimensions(), i)) { + dnums.lhs_non_contracting_dims.emplace_back(); + dnums.lhs_non_contracting_dims.back().lhs = i; + dnums.lhs_non_contracting_dims.back().rhs = -1; + dnums.lhs_non_contracting_dims.back().output = + dot_dim_numbs.lhs_batch_dimensions_size() + + dnums.lhs_non_contracting_dims.size() - 1; + dnums.lhs_non_contracting_dims.back().spatial_dim = -1; + } + } + for (int64 i = 0; i < dot->operand(1)->shape().rank(); ++i) { + if (!absl::c_linear_search(dot_dim_numbs.rhs_batch_dimensions(), i) && + !absl::c_linear_search(dot_dim_numbs.rhs_contracting_dimensions(), i)) { + dnums.rhs_non_contracting_dims.emplace_back(); + 
dnums.rhs_non_contracting_dims.back().lhs = -1; + dnums.rhs_non_contracting_dims.back().rhs = i; + dnums.rhs_non_contracting_dims.back().output = + dot_dim_numbs.lhs_batch_dimensions_size() + + dnums.lhs_non_contracting_dims.size() + + dnums.rhs_non_contracting_dims.size() - 1; + dnums.rhs_non_contracting_dims.back().spatial_dim = -1; + } + } + return dnums; } } // namespace dot_as_convolution_util diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.h b/tensorflow/compiler/xla/service/dot_as_convolution_util.h index 6a7cacf812d..16a542208d2 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.h +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.h @@ -25,8 +25,9 @@ limitations under the License. namespace xla { namespace dot_as_convolution_util { -// Describes the dimensions of a convolution that can be interpreted as a dot. -struct DotGeneralAsConvolutionDimsInfo { +// Describes the dimensions of a convolution that can be interpreted as a dot +// or a normal convolution. +struct DotConvolutionDimsInfo { // The dimension numbers for the operands and output corresponding to a // logical dimension (e.g., batch, contracting, non-contracting). If an // operand or the output doesn't have the logical dimension, it is set to @@ -43,23 +44,22 @@ struct DotGeneralAsConvolutionDimsInfo { std::vector contracting_dims; std::vector lhs_non_contracting_dims; std::vector rhs_non_contracting_dims; + std::vector conv_spatial_dims; }; -// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo if it can -// be interpreted as a dot, or absl::nullopt otherwise. -absl::optional ParseDotGeneralFromConvolution( - const HloInstruction* conv); +// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo. If it can +// be interpreted as a dot, there is no conv_spatial_dims. +DotConvolutionDimsInfo ParseConvolutionDimsInfo(const HloInstruction* conv); // Creates sharded convolution instruction that can be interpreted as a dot. // This is a utility for per-op partitioners. // - 'conv' is the original convolution instruction. -// - 'dot_dnums' is the result of ParseDotGeneralFromConvolution() for 'conv'. +// - 'dot_dnums' is the result of ParseDotConvolutionDimsInfo() for 'conv'. // - 'sharded_lhs_hlo' and 'sharded_rhs_hlo' are sharded inputs for the result // convolution instruction. StatusOr> CreateShardedConvForDotGeneralConvolution( - const HloInstruction& conv, - const DotGeneralAsConvolutionDimsInfo& dot_dnums, + const HloInstruction& conv, const DotConvolutionDimsInfo& dot_dnums, HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo); // Check if a spatial dim is parallel batch dimension. @@ -68,6 +68,10 @@ CreateShardedConvForDotGeneralConvolution( // dilation B. bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size); +// Returns a DotConvolutionDimsInfo from a kDot instruction, where all +// the spatial_dim values are set to -1. 
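A concrete instance of the mapping that the declaration just below implements; the dot shapes here are hypothetical, chosen only for illustration:

// For dot = f32[8,128,256] dot(f32[8,128,64] lhs, f32[8,64,256] rhs) with
//   lhs_batch_dims={0}, rhs_batch_dims={0},
//   lhs_contracting_dims={2}, rhs_contracting_dims={1},
// ParseDotGeneralFromDot returns a DotConvolutionDimsInfo with
//   batch_dims               = {{lhs:0,  rhs:0,  output:0,  spatial_dim:-1}}
//   contracting_dims         = {{lhs:2,  rhs:1,  output:-1, spatial_dim:-1}}
//   lhs_non_contracting_dims = {{lhs:1,  rhs:-1, output:1,  spatial_dim:-1}}
//   rhs_non_contracting_dims = {{lhs:-1, rhs:2,  output:2,  spatial_dim:-1}}
//   conv_spatial_dims        = {}   (a plain dot has no spatial dimensions)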
+DotConvolutionDimsInfo ParseDotGeneralFromDot(const HloInstruction* dot); + } // namespace dot_as_convolution_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index 2f2456863e9..80f98775c01 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -97,6 +97,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault { Status HandleTranspose(HloInstruction* hlo) override; + Status HandleDynamicReshape(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; Status HandleSort(HloInstruction* hlo) override; @@ -621,6 +623,18 @@ Status DynamicDimensionInferenceVisitor::HandleClamp(HloInstruction* hlo) { return PassThroughDynamicDimension(hlo); } +Status DynamicDimensionInferenceVisitor::HandleDynamicReshape( + HloInstruction* hlo) { + HloDynamicReshapeInstruction* dynamic_reshape = + Cast(hlo); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->shape().is_dynamic_dimension(i)) { + parent_->SetDynamicSize(hlo, {}, i, dynamic_reshape->dim_sizes(i)); + } + } + return Status::OK(); +} + Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, @@ -805,7 +819,8 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { } if (input_dim_size > output_dim_size) { - TF_RET_CHECK(input_dim_size % output_dim_size == 0); + TF_RET_CHECK(input_dim_size % output_dim_size == 0) + << reshape->ToString(); const int64 divisor = input_dim_size / output_dim_size; HloInstruction* divisor_hlo = hlo->parent()->AddInstruction(HloInstruction::CreateConstant( diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index b5a17619edf..69f64c31a2f 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -1248,5 +1248,34 @@ TEST_F(DynamicDimensionInferenceTest, InfersCustomOp) { EXPECT_TRUE(handler_called); } +TEST_F(DynamicDimensionInferenceTest, DynamicReshapeOp) { + auto builder = HloComputation::Builder(TestName()); + auto input = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {9}), "data_input")); + auto six = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(6))); + // Creates an input of shape [<=9], dynamic size is 6. 
+ auto dynamic_input = + builder.AddInstruction(HloInstruction::CreateSetDimensionSize( + ShapeUtil::MakeShape(F32, {9}, {true}), input, six, 0)); + auto dynamic_size = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(S32, {}), "size_param")); + auto three = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3))); + + // Reshape [<=9] into [3, <=3] + + auto dynamic_reshape = + builder.AddInstruction(HloInstruction::CreateDynamicReshape( + ShapeUtil::MakeShape(F32, {3, 3}, {false, true}), dynamic_input, + {three, dynamic_size})); + + module_->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK(RunInference()); + EXPECT_EQ(inference_->GetDynamicSize(dynamic_reshape, {}, 0), nullptr); + EXPECT_EQ(inference_->GetDynamicSize(dynamic_reshape, {}, 1), dynamic_size); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index c1f9da599e8..b4c56113239 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -32,6 +32,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -125,6 +127,74 @@ StatusOr ChooseIdentityValue(HloInstruction* inst, } } +StatusOr ReplaceGetSize( + HloInstruction* instr, + DynamicDimensionInference* dynamic_dimension_inference) { + if (instr->opcode() != HloOpcode::kGetDimensionSize) { + return false; + } + HloComputation* computation = instr->parent(); + + TF_ASSIGN_OR_RETURN(auto legal_shape, + ShapeInference::InferGetDimensionSizeShape( + instr->operand(0)->shape(), instr->dimension())); + TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)) + << "instr->shape() " << instr->shape().ToString() << " , " + << "legal_shape " << legal_shape.ToString(); + TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), S32)); + HloInstruction* operand = instr->mutable_operand(0); + int64 dim = instr->dimension(); + HloInstruction* dynamic_size = + dynamic_dimension_inference->GetDynamicSize(operand, {}, dim); + if (dynamic_size != nullptr) { + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size)); + // The dependency between a instruction and its dynamic dimensions is not + // modeled in the IR. As instr is being replaced by dynamic_size, also tell + // dynamic dimension inference that the instruction is being replaced. 
+ dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith( + instr, dynamic_size); + } else { + int32 size = instr->operand(0)->shape().dimensions(dim); + HloInstruction* new_instr = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); + dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith(instr, + new_instr); + } + return true; +} + +StatusOr ReplaceSetSize(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kSetDimensionSize) { + return false; + } + + TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( + instr->shape(), instr->operand(0)->shape())) + << "instr->shape() " << instr->shape().ToString() << " , " + << "instruction operand shape " << instr->operand(0)->shape(); + HloInstruction* operand = instr->mutable_operand(0); + + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); + return true; +} + +StatusOr ReplaceSetBound(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kCustomCall || + instr->custom_call_target() != "SetBound") { + return false; + } + + TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( + instr->shape(), instr->operand(0)->shape())) + << "instr->shape() " << instr->shape().ToString() << " , " + << "instruction operand shape " << instr->operand(0)->shape(); + HloInstruction* operand = instr->mutable_operand(0); + + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); + return true; +} + bool ShouldSkipPadOnOperand(const HloInstruction* inst, int64 operand_num, int64 dimension) { if ((inst->opcode() == HloOpcode::kReduceWindow || @@ -1236,6 +1306,18 @@ StatusOr DynamicPadder::Run(HloModule* module) { changed, RewriteDynamicReshape(inst, &dynamic_dimension_inference)); continue; } + + if (inst->opcode() == HloOpcode::kDynamicReshape) { + TF_ASSIGN_OR_RETURN( + changed, RewriteDynamicReshape(inst, &dynamic_dimension_inference)); + auto* static_reshape = + computation->AddInstruction(HloInstruction::CreateReshape( + inst->shape(), inst->mutable_operand(0))); + TF_RETURN_IF_ERROR(inst->ReplaceAllUsesWith(static_reshape)); + TF_RETURN_IF_ERROR(dynamic_dimension_inference.ForwardDynamicSize( + inst, static_reshape, {})); + continue; + } for (int64 operand_num = 0; operand_num < inst->operand_count(); ++operand_num) { HloInstruction* original_operand = inst->mutable_operand(operand_num); @@ -1292,6 +1374,25 @@ StatusOr DynamicPadder::Run(HloModule* module) { /*require_dynamic_output=*/require_dynamic_output)); } + for (auto* computation : module->computations()) { + for (auto instruction : computation->MakeInstructionPostOrder()) { + TF_ASSIGN_OR_RETURN( + bool replaced_get_size, + ReplaceGetSize(instruction, &dynamic_dimension_inference)); + changed = changed || replaced_get_size; + } + } + + for (auto* computation : module->computations()) { + for (auto instruction : computation->MakeInstructionPostOrder()) { + TF_ASSIGN_OR_RETURN(bool replaced_set_size, ReplaceSetSize(instruction)); + TF_ASSIGN_OR_RETURN(bool replaced_set_bound, + ReplaceSetBound(instruction)); + changed = changed || replaced_set_size; + changed = changed || replaced_set_bound; + } + } + HloDCE dce; TF_ASSIGN_OR_RETURN(changed, dce.Run(module)); VLOG(2) << "Post DynamicPadder HLO:"; diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc index e8f429d9db6..3855531a97b 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc +++ 
b/tensorflow/compiler/xla/service/dynamic_padder_test.cc @@ -20,7 +20,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -380,10 +379,15 @@ class ExecutionTest : public HloTestBase { Literal PadAndExecute(std::unique_ptr module, absl::Span arguments, bool slice_dynamic_output = true) { + if (!slice_dynamic_output) { + auto new_config = module->config(); + new_config.mutable_entry_computation_layout() + ->mutable_result_layout() + ->ClearDynamicShape(); + module->set_config(new_config); + } DynamicPadder padder(slice_dynamic_output); TF_CHECK_OK(padder.Run(module.get()).status()); - HloGetDimensionSizeRewriter rewriter; - TF_CHECK_OK(rewriter.Run(module.get()).status()); HloDCE dce; TF_CHECK_OK(dce.Run(module.get()).status()); return ExecuteAndTransfer(std::move(module), arguments); @@ -1179,6 +1183,84 @@ ENTRY main { EXPECT_EQ(result, expected); } +XLA_TEST_F(ExecutionTest, DynamicReshapeDoubleDynamicDimensions) { + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +ENTRY main { + param = s32[2, 3, 3] parameter(0) + size = s32[] constant(2) + param_padded_partial = s32[2, <=3, 3] set-dimension-size(param, size), + dimensions={1} + param_padded = s32[2, <=3, <=3] set-dimension-size(param_padded_partial, size), + dimensions={2} + result_size = s32[] constant(8) + ROOT reshaped = s32[<=18] dynamic-reshape(param_padded, result_size) +} +)"; + + // First dimension (1) is dynamic. Since dynamic size is 0, result is also 0. + Literal operand = LiteralUtil::CreateR3( + {{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}, {{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}); + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {&operand}, false); + result.SetDynamicSize(0, 8); + // Padded data looks like this (P is padding which is ignored). + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // Reshaping (with correct reshape rewriting) produces: + // [0, 1, 3, 4, 0, 1, 3, 4] + Literal expected = LiteralUtil::CreateR1({0, 1, 3, 4, 0, 1, 3, 4}); + + EXPECT_EQ(result, expected); +} + +XLA_TEST_F(ExecutionTest, DynamicReshapeOutputDoubleDynamicDimensions) { + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +ENTRY main { + param = s32[18] parameter(0) + eight = s32[] constant(8) + param_dynamic = s32[<=18] set-dimension-size(param, eight), dimensions={0} + two = s32[] constant(2) + // every dimension has dynamic size two. + ROOT reshaped = s32[2, <=3, <=3] dynamic-reshape(param_dynamic, two, two, two) +} +)"; + Literal operand = LiteralUtil::CreateR1( + {0, 1, 3, 4, 0, 1, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {&operand}, false); + + result.SetDynamicSize(1, 2); + result.SetDynamicSize(2, 2); + // Padded operand is: + // [0, 1, 3, 4, 0, 1, 3, 4, P, P ....] 
+ // + // Reshaping it should produce: + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + Literal expected = + LiteralUtil::CreateR3({{{0, 1}, {3, 4}}, {{0, 1}, {3, 4}}}); + + EXPECT_EQ(result, expected); +} + XLA_TEST_F(ExecutionTest, SetGetDimensionSize) { const string hlo_text = R"( HloModule TensorFlowScatterV1 @@ -1371,5 +1453,70 @@ ENTRY main { EXPECT_EQ(result, expected); } +namespace op = xla::testing::opcode_matchers; + +class HloDimensionSizeLegalizerTest : public HloTestBase { + protected: + HloDimensionSizeLegalizerTest() {} +}; + +TEST_F(HloDimensionSizeLegalizerTest, Ok) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = s32[] get-dimension-size(p), dimensions={0} + size1 = s32[] get-dimension-size(p), dimensions={1} + ROOT mul = s32[] multiply(size0, size1) +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloDimensionSizeLegalizerTest, GetSetSetDimensionSizeRewriter) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = s32[] get-dimension-size(p), dimensions={0} + p_copy = s32[3,4] copy(p) + p_copy_dynamic = s32[<=3, 4] set-dimension-size(p_copy, size0), dimensions={0} + size1 = s32[] get-dimension-size(p_copy_dynamic), dimensions={0} + ROOT mul = s32[] multiply(size0, size1) +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloDimensionSizeLegalizerTest, IllegalType) { + auto module = ParseAndReturnUnverifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3]{0} parameter(0) + ROOT gds = s64[] get-dimension-size(p), dimensions={0} +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +TEST_F(HloDimensionSizeLegalizerTest, IllegalDimension) { + auto module = ParseAndReturnUnverifiedModule(R"( +HloModule _ +ENTRY gds { + p = f32[2,5] parameter(0) + ROOT gds = s32[] get-dimension-size(p), dimensions={2} +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc index 75d39298aa3..ab6a3d01d21 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc @@ -27,33 +27,25 @@ namespace xla { FusionNodeIndexingEvaluation::FusionNodeIndexingEvaluation( const HloInstruction* fusion) : fusion_(fusion) { - total_emitted_instructions_ = 0; HloInstruction* root = fusion->fused_expression_root(); indexing_users_[root].insert(fusion); index_usage_count_[fusion] = 1; RecomputeCache(); } -bool FusionNodeIndexingEvaluation::AverageCodeDuplicationTooHigh( +bool FusionNodeIndexingEvaluation::CodeDuplicationTooHigh( const HloInstruction* producer) const { // This constant is arbitrarily chosen. Essentially we don't want to have too // much code duplication, because it slows down the compilation time. There is // a tradeoff between compilation time and runtime here. 
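A small worked example of the revised duplication check; the usage counts are hypothetical:

// Suppose 'producer' has two indexing users inside the fusion, u1 and u2,
// whose index_usage_count_ entries are 3 and 2. Then
//   EvaluateEmittedInstructions(producer) = 3 + 2 = 5,
// i.e. fusing 'producer' would emit it five times. Since 5 <= 15
// (kAllowedCodeDuplication, declared just below), CodeDuplicationTooHigh
// returns false and the duplication check does not block the fusion.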
const int64 kAllowedCodeDuplication = 15; - // index_usage_count_ contains an entry for each instruction in the fusion - // computation (except parameter instructions), plus an entry for the 'fusion' - // instruction. So the size of this map is already one bigger than the number - // of instructions in the fusion node that are emitted, thus accounting for - // the number of instructions after 'producer' is fused. - return EvaluateTotalEmittedInstructions(producer) / - index_usage_count_.size() > - kAllowedCodeDuplication; + return EvaluateEmittedInstructions(producer) > kAllowedCodeDuplication; } -int64 FusionNodeIndexingEvaluation::EvaluateTotalEmittedInstructions( +int64 FusionNodeIndexingEvaluation::EvaluateEmittedInstructions( const HloInstruction* producer) const { - int64 total = total_emitted_instructions_; + int64 total = 0; for (const auto* user : indexing_users_.at(producer)) { total += index_usage_count_.at(user); } @@ -96,19 +88,9 @@ void FusionNodeIndexingEvaluation::UpdateIndexUsageCount( const HloInstruction* instruction) { int64 total = 0; for (const auto* user : indexing_users_[instruction]) { - int64 weight = 1; - // Concatenate is special: the index differs for each operand, so - // in the worst case we have to deal with as many index values as - // the number of operands of Concatenate. By considering the worst - // case, we are more conservative than necessary regarding - // counting the index usage. - if (user->opcode() == HloOpcode::kConcatenate) { - weight = user->operand_count(); - } - total += index_usage_count_.at(user) * weight; + total += index_usage_count_.at(user); } CHECK(index_usage_count_.emplace(instruction, total).second); - total_emitted_instructions_ += total; } void FusionNodeIndexingEvaluation::UpdateIndexingUsersOfOperands( diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h index 9630986d188..b85bf9104c7 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h @@ -26,17 +26,14 @@ class FusionNodeIndexingEvaluation { public: explicit FusionNodeIndexingEvaluation(const HloInstruction* fusion); - // Evaluate the average number of times an instruction is emitted inside the - // fusion node, if 'producer' is fused into 'fusion_'. If this average - // duplication is "too high" (some arbitrary chosen constant), returns - // true. - bool AverageCodeDuplicationTooHigh(const HloInstruction* producer) const; + // Evaluate the number of times 'producer' would be emitted if it is fused + // into 'fusion_'. If the duplication is "too high" (some arbitrary chosen + // constant), returns true. + bool CodeDuplicationTooHigh(const HloInstruction* producer) const; - // Evaluate the total number of times an instruction is emitted inside the - // fusion node, if 'producer' is fused into 'fusion_'. An instruction may be - // emitted several times, once for each different index value with which it is - // indexed. - int64 EvaluateTotalEmittedInstructions(const HloInstruction* producer) const; + // Evaluate the number of times 'producer' would be emitted if it is fused + // into 'fusion_'. + int64 EvaluateEmittedInstructions(const HloInstruction* producer) const; // Update the evaluation cache after having fused 'producer' into 'fusion_'. 
// 'producer' is the cloned instruction which is now part of the fusion @@ -84,9 +81,6 @@ class FusionNodeIndexingEvaluation { // The fusion instruction. const HloInstruction* fusion_; - - // The total number of emitted instructions. - int64 total_emitted_instructions_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc index b20f52d2d62..b00abdc9abf 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc @@ -29,7 +29,7 @@ using FusionNodeIndexingEvaluationTest = HloTestBase; // Subclass of InstructionFusion exposing the protected methods Fuse and // FuseInstruction for testing. Also adds the FusionNodeIndexingEvaluation to -// track the average code duplication due to indexing HloInstructions with +// track the code duplication due to indexing HloInstructions with // different index values. class InstructionFusionForTesting : public InstructionFusion { public: @@ -61,8 +61,8 @@ class InstructionFusionForTesting : public InstructionFusion { return InstructionFusion::Fuse(producer, consumer); } - int64 EvaluateTotalEmittedInstructions(const HloInstruction* producer, - const HloInstruction* consumer) { + int64 EvaluateEmittedInstructions(const HloInstruction* producer, + const HloInstruction* consumer) { if (consumer->opcode() != HloOpcode::kFusion) { return 0; } @@ -71,8 +71,8 @@ class InstructionFusionForTesting : public InstructionFusion { fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - return fusion_node_evaluations_.at(consumer) - .EvaluateTotalEmittedInstructions(producer); + return fusion_node_evaluations_.at(consumer).EvaluateEmittedInstructions( + producer); } private: @@ -109,8 +109,7 @@ TEST_F(FusionNodeIndexingEvaluationTest, FuseThreeInstructions) { HloInstruction* slice1 = sub->mutable_operand(0); HloInstruction* slice2 = sub->mutable_operand(1); auto fusion = instruction_fusion.Fuse(slice1, sub); - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(slice2, fusion), - 3); + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice2, fusion), 1); instruction_fusion.Fuse(slice2, fusion); } @@ -151,37 +150,31 @@ TEST_F(FusionNodeIndexingEvaluationTest, ExponentialDuplicationPattern) { HloInstruction* slice2_1 = add2->mutable_operand(1); auto fusion = instruction_fusion.Fuse(slice2_0, add2); // So far we have fused add2 and slice2.0. So when we also fuse slice2.1, we - // expect to emit 3 instructions. - EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice2_1, fusion), 3); + // expect to emit it 1 time. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice2_1, fusion), + 1); instruction_fusion.Fuse(slice2_1, fusion); HloInstruction* add1 = fusion->mutable_operand(0); EXPECT_EQ(add1->opcode(), HloOpcode::kAdd); - // If we fuse add1 into 'fusion', it needs to be emitted twice, adding 2 to - // the sum. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add1, fusion), - 5); + // If we fuse add1 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add1, fusion), 2); instruction_fusion.Fuse(add1, fusion); HloInstruction* slice1_0 = fusion->mutable_operand(0); EXPECT_EQ(slice1_0->opcode(), HloOpcode::kSlice); - // If we fuse slice1.0 into 'fusion', it needs to be emitted twice, adding 2 - // to the sum. 
- EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice1_0, fusion), 7); + // If we fuse slice1.0 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice1_0, fusion), + 2); instruction_fusion.Fuse(slice1_0, fusion); HloInstruction* slice1_1 = fusion->mutable_operand(0); EXPECT_EQ(slice1_1->opcode(), HloOpcode::kSlice); - // If we fuse slice1.1 into 'fusion', it needs to be emitted twice, adding 2 - // to the sum. - EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice1_1, fusion), 9); + // If we fuse slice1.1 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice1_1, fusion), + 2); instruction_fusion.Fuse(slice1_1, fusion); HloInstruction* add0 = fusion->mutable_operand(0); EXPECT_EQ(add0->opcode(), HloOpcode::kAdd); - // If we fuse add0 into 'fusion', it needs to be emitted twice, adding 4 to - // the sum. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add0, fusion), - 13); + // If we fuse add0 into 'fusion', it needs to be emitted four times. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add0, fusion), 4); instruction_fusion.Fuse(add0, fusion); } @@ -212,10 +205,9 @@ ENTRY entry_computation { HloInstruction* add0 = fusion->mutable_operand(0); EXPECT_EQ(add0->opcode(), HloOpcode::kAdd); // Here, the cache for the fusion node needs to be recomputed. Make sure we - // still get the same evaluation as before when we incrementally built the + // still get the same evaluation as before when we incrementally build the // cache. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add0, fusion), - 13); + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add0, fusion), 4); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc index 1838f65e6ea..d38873a501d 100644 --- a/tensorflow/compiler/xla/service/gather_expander.cc +++ b/tensorflow/compiler/xla/service/gather_expander.cc @@ -269,6 +269,22 @@ static StatusOr PermuteBatchAndOffsetDims( return MakeTransposeHlo(accumulator, permutation); } +// Computes how many trips a loop implementing this gather op would take. 
+static int64 GatherLoopTripCount(HloInstruction* gather_instr) { + HloInstruction* start_indices = gather_instr->mutable_operand(1); + const Shape& start_indices_shape = start_indices->shape(); + const GatherDimensionNumbers& dim_numbers = + gather_instr->gather_dimension_numbers(); + + int64 trip_count = 1; + for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { + if (i != dim_numbers.index_vector_dim()) { + trip_count *= start_indices_shape.dimensions(i); + } + } + return trip_count; +} + // High Level Algorithm // // We follow the following steps in sequence: @@ -311,20 +327,13 @@ StatusOr GatherExpander::ExpandInstruction( HloComputation* computation = gather_instr->parent(); HloInstruction* operand = gather_instr->mutable_operand(0); HloInstruction* start_indices = gather_instr->mutable_operand(1); - const Shape& start_indices_shape = start_indices->shape(); const Shape& output_shape = gather_instr->shape(); int64 output_rank = output_shape.dimensions_size(); const GatherDimensionNumbers& dim_numbers = gather_instr->gather_dimension_numbers(); - int64 gather_loop_trip_count = 1; - for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { - if (i != dim_numbers.index_vector_dim()) { - gather_loop_trip_count *= start_indices_shape.dimensions(i); - } - } - + int64 gather_loop_trip_count = GatherLoopTripCount(gather_instr); if (!IsInt32(gather_loop_trip_count)) { return Unimplemented( "Gather operations with more than 2147483647 gather indices are not " @@ -373,7 +382,11 @@ bool GatherExpander::InstructionMatchesPattern(HloInstruction* inst) { return inst->opcode() == HloOpcode::kGather && // Avoid expanding gather ops that produce zero sized tensors, // instead punt these to ZeroSizedHloElimination. - !ShapeUtil::IsZeroElementArray(inst->shape()); + !ShapeUtil::IsZeroElementArray(inst->shape()) && + // In kEliminateSimpleGathers mode, we only simplify instructions + // which can be represented without a loop -- i.e. we only simplify + // gathers which have a trip count of 1. + (mode_ == kEliminateAllGathers || GatherLoopTripCount(inst) == 1); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h index 5625a37cb46..e665fcd713c 100644 --- a/tensorflow/compiler/xla/service/gather_expander.h +++ b/tensorflow/compiler/xla/service/gather_expander.h @@ -21,10 +21,30 @@ limitations under the License. namespace xla { // This pass rewrites gather operations into (roughly) while loops of dynamic -// slices. This lets backends that don't support gather directly to -// nevertheless have a minimum level of support. +// slices. +// +// This pass can be used two ways: +// +// - kEliminateAllGathers: For backends that don't support gather, this pass +// can convert every gather to a loop. +// +// - kEliminateSimpleGathers: For backends that *do* support gather, this pass +// can strength-reduce "simple" gathers -- specifically, gathers that can be +// represented without a loop -- to dyanmic-slices. +// +// Note that even in kEliminateSimpleGathers mode, this pass may still expand a +// gather into a loop (with a trip-count of 1). It's up to other simplification +// passes to remove the loop. 
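Two quick trip-count checks, taken from the tests added to gather_expander_test.cc further below, show how the new mode interacts with GatherLoopTripCount:

// - start_indices of shape s32[2] with index_vector_dim=1: dimension 0 is the
//   only dimension different from index_vector_dim, so the trip count is 2;
//   in kEliminateSimpleGathers mode the gather is left alone.
// - start_indices of shape s32[1] with index_vector_dim=0: every dimension is
//   the index vector dimension, so the trip count is 1 and the gather is
//   expanded in either mode (the resulting trip-count-1 loop is left to later
//   simplification passes).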
+// class GatherExpander : public OpExpanderPass { public: + enum Mode { + kEliminateAllGathers, + kEliminateSimpleGathers, + }; + + explicit GatherExpander(Mode m) : mode_(m) {} + absl::string_view name() const override { return "gather_expander"; } protected: @@ -32,6 +52,9 @@ class GatherExpander : public OpExpanderPass { StatusOr ExpandInstruction( HloInstruction* gather_inst) override; + + private: + Mode mode_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc index 706327091d9..4b0808e9aaf 100644 --- a/tensorflow/compiler/xla/service/gather_expander_test.cc +++ b/tensorflow/compiler/xla/service/gather_expander_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gather_expander.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -42,7 +43,9 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(hlo_text)); - Status status = GatherExpander{}.Run(module.get()).status(); + Status status = GatherExpander{GatherExpander::kEliminateAllGathers} + .Run(module.get()) + .status(); EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED); ASSERT_THAT( @@ -68,7 +71,9 @@ ENTRY main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(hlo_text)); - TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, + GatherExpander{GatherExpander::kEliminateAllGathers}.Run(module.get())); ASSERT_TRUE(changed); HloInstruction* while_instr = nullptr; @@ -129,7 +134,9 @@ ENTRY main { OpMetadata metadata; metadata.set_op_name("Gather"); module->entry_computation()->root_instruction()->set_metadata(metadata); - TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, + GatherExpander{GatherExpander::kEliminateAllGathers}.Run(module.get())); ASSERT_TRUE(changed); HloInstruction* while_instr = nullptr; @@ -147,5 +154,54 @@ ENTRY main { "after gather expansion"; EXPECT_EQ(while_instr->metadata().op_name(), "Gather"); } + +TEST_F(GatherExpanderTest, EliminateSimpleGathersSkipsNontrivialGather) { + const string hlo_text = R"( +HloModule TensorFlowGatherV1 + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + ROOT gather = s32[2,3] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1, 3} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_text)); + GatherExpander pass(GatherExpander::kEliminateSimpleGathers); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, module.get())); + ASSERT_FALSE(changed); +} + +TEST_F(GatherExpanderTest, EliminateSimpleGathersRewritesTrivialGather) { + const string hlo_text = R"( +HloModule test + +ENTRY main { + operand = s32[100] parameter(0) + indices = s32[1] parameter(1) + ROOT gather = s32[10] gather(operand, indices), + offset_dims={0}, + collapsed_slice_dims={}, + start_index_map={0}, + index_vector_dim=0, + slice_sizes={10} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_text)); + GatherExpander pass(GatherExpander::kEliminateAllGathers); + TF_ASSERT_OK_AND_ASSIGN(bool changed, 
RunHloPass(&pass, module.get())); + ASSERT_TRUE(changed); + ASSERT_FALSE(hlo_query::ContainsInstrWithOpcode(module->entry_computation(), + {HloOpcode::kGather})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 8dfd73e9a6a..c861ceffc05 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -254,11 +254,18 @@ cc_library( ":target_util", ":thunk", ":thunk_emitter", + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/mlir/xla:hlo_utils", + "//tensorflow/compiler/mlir/xla:mhlo_to_lhlo_with_xla", + "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", + "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -286,10 +293,13 @@ cc_library( "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Core", "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", ], ) @@ -1025,6 +1035,24 @@ cc_library( ], ) +tf_cc_test( + name = "gpu_conv_padding_legalization_test", + srcs = ["gpu_conv_padding_legalization_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_conv_padding_legalization", + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:test", + ], +) + cc_library( name = "cudnn_pad_for_convolutions", srcs = ["cudnn_pad_for_convolutions.cc"], @@ -1144,6 +1172,7 @@ cc_library( ":gpu_sanitize_constant_names", ":gpu_scatter_expander", ":horizontal_fusion", + ":horizontal_input_fusion", ":instruction_fusion", ":ir_emission_utils", ":ir_emitter", @@ -1158,6 +1187,7 @@ cc_library( ":target_constants", ":tree_reduction_rewriter", ":variadic_op_splitter", + "//tensorflow/compiler/mlir/xla:mhlo_to_lhlo_with_xla", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -1168,6 +1198,7 @@ cc_library( "//tensorflow/compiler/xla/service:batchnorm_expander", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:call_inliner", + "//tensorflow/compiler/xla/service:comparison_expander", "//tensorflow/compiler/xla/service:conditional_canonicalizer", "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_4d_expander", @@ -1177,13 +1208,13 @@ cc_library( "//tensorflow/compiler/xla/service:dynamic_padder", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:hlo", 
"//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", "//tensorflow/compiler/xla/service:hlo_dataflow_analysis", "//tensorflow/compiler/xla/service:hlo_dce", "//tensorflow/compiler/xla/service:hlo_element_type_converter", - "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", "//tensorflow/compiler/xla/service:hlo_proto_util", @@ -1214,6 +1245,8 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Core", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", ], ) @@ -1480,6 +1513,7 @@ cc_library( hdrs = ["stream_executor_util.h"], copts = tf_copts(), deps = [ + ":launch_dimensions", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -1718,6 +1752,7 @@ cc_library( srcs = ["horizontal_fusion.cc"], hdrs = ["horizontal_fusion.h"], deps = [ + ":gpu_fusible", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_creation_utils", @@ -1754,6 +1789,45 @@ tf_cc_test( ], ) +cc_library( + name = "horizontal_input_fusion", + srcs = ["horizontal_input_fusion.cc"], + hdrs = ["horizontal_input_fusion.h"], + deps = [ + ":gpu_fusible", + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_creation_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "horizontal_input_fusion_test", + srcs = ["horizontal_input_fusion_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + ":horizontal_input_fusion", + ":multi_output_fusion", + "//tensorflow/compiler/jit:xla_gpu_jit", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "reduction_degenerate_dim_remover", srcs = ["reduction_degenerate_dim_remover.cc"], @@ -1909,16 +1983,3 @@ cc_library( "@llvm-project//mlir:LLVMDialect", ], ) - -# Library with XLA thunks dialect static initialization. 
-cc_library( - name = "xla_thunks_dialect_registration", - srcs = [ - "ir/dialect_registration.cc", - ], - deps = [ - ":xla_thunks_ops", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index 9b192aaa8e1..10a565308de 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -613,10 +613,13 @@ static StatusOr DeviceCompare(se::Stream* stream, LaunchDimensions dim = CalculateLaunchDimensions(buffer_shape, gpu_device_info); - stream->ThenLaunch(se::ThreadDim(dim.threads_per_block()), - se::BlockDim(dim.block_count()), *comparison_kernel, - lhs_typed, rhs_typed, static_cast(kTolerance), - buffer_size, out_param.cref()); + LaunchDimensions::Dim3D thread_counts = dim.thread_counts_per_block(); + LaunchDimensions::Dim3D block_counts = dim.block_counts(); + stream->ThenLaunch( + se::ThreadDim(thread_counts.x, thread_counts.y, thread_counts.z), + se::BlockDim(block_counts.x, block_counts.y, block_counts.z), + *comparison_kernel, lhs_typed, rhs_typed, static_cast(kTolerance), + buffer_size, out_param.cref()); uint64 result = -1; CHECK_EQ(out_param->size(), sizeof(result)); diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 60e4cb84b09..fa066e9d320 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -201,8 +201,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // Merging into all users enables the removal of 'fusion' from the // computation. if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) { - return user->opcode() == HloOpcode::kFusion && - IsProducerConsumerFusible(*fusion, *user); + return IsProducerConsumerFusible(*fusion, *user); })) { VLOG(3) << "Not merging " << fusion->name() << ": Some of its users are not loop/input fusion kernels."; @@ -230,18 +229,15 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // This is done to avoid the duplication of expensive instructions, which // would occur if 'fusion' were merged into multiple users. // - // If 'fusion' has just one user, then an earlier fusion pass chose not to - // fuse this producer/consumer pair (likely because of expensive instruction - // re-use by the consumer), and so we honor that choice here as well. - // - // Moreover, if we are going to save a "lot" in memory bandwidth then we + // However, if we are going to save a "lot" in memory bandwidth then we // ignore how expensive the fusion instructions are. The heuristic used to // determine "a lot" is the following: merging must reduce memory traffic by a // factor of 0.3, and the amount of memory accessed must not be entirely // trivial (above 1K). This likely has room for improvement in the future. bool allow_expensive_ops = - merged_to_current_bytes_ratio < 0.3 && current_bytes_transferred > 1024; + fusion->user_count() == 1 || + (merged_to_current_bytes_ratio < 0.3 && current_bytes_transferred > 1024); if (!allow_expensive_ops && absl::c_any_of(fusion->fused_instructions(), @@ -286,7 +282,15 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // Merge fused instructions from 'fusion' into each user. 
std::vector users = fusion->users(); for (HloInstruction* user : users) { - user->MergeFusionInstruction(fusion); + if (user->opcode() == HloOpcode::kFusion) { + user->MergeFusionInstruction(fusion); + } else { + HloInstruction* fused_user = + computation_->AddInstruction(HloInstruction::CreateFusion( + user->shape(), ChooseFusionKind(*fusion, *user), user)); + TF_CHECK_OK(computation_->ReplaceInstruction(user, fused_user)); + fused_user->MergeFusionInstruction(fusion); + } changed_ = true; } ++total_merged_; @@ -299,7 +303,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { }) << " }"; // Remove 'fusion' instruction. - CHECK_EQ(0, fusion->user_count()); + CHECK_EQ(0, fusion->user_count()) << fusion->ToString(); return computation_->RemoveInstruction(fusion); } diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index 42891154c23..d08c732e611 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -234,6 +234,54 @@ TEST_F(FusionMergerTest, WillMergeIntoInputFusion) { op::Fusion(op::Parameter())); } +TEST_F(FusionMergerTest, WillMergeIntoUnfusedConsumer) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule jit_matmul.36 + + max (parameter.13: f32[], parameter.14: f32[]) -> f32[] { + parameter.13 = f32[] parameter(0) + parameter.14 = f32[] parameter(1) + ROOT maximum.15 = f32[] maximum(f32[] parameter.13, f32[] parameter.14) + } + + add (parameter.29: f32[], parameter.30: f32[]) -> f32[] { + parameter.29 = f32[] parameter(0) + parameter.30 = f32[] parameter(1) + ROOT add.31 = f32[] add(f32[] parameter.29, f32[] parameter.30) + } + + fused_computation.1 (param_1.4: f32[200,200,200], param_2.1: f32[200,200]) -> f32[200,200] { + param_1.4 = f32[200,200,200]{2,1,0} parameter(0) + param_2.1 = f32[200,200]{1,0} parameter(1) + broadcast.3 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_2.1), dimensions={0,2} + subtract.0 = f32[200,200,200]{2,1,0} subtract(f32[200,200,200]{2,1,0} param_1.4, f32[200,200,200]{2,1,0} broadcast.3) + exponential.0 = f32[200,200,200]{2,1,0} exponential(f32[200,200,200]{2,1,0} subtract.0) + constant.27 = f32[] constant(0) + ROOT reduce.0 = f32[200,200]{1,0} reduce(f32[200,200,200]{2,1,0} exponential.0, f32[] constant.27), dimensions={1}, to_apply=add + } + + fused_computation.3 (param_0.7: f32[200,200], param_1.9: f32[200,200]) -> f32[200,200,200] { + param_1.9 = f32[200,200]{1,0} parameter(1) + broadcast.10 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_1.9), dimensions={0,1} + param_0.7 = f32[200,200]{1,0} parameter(0) + broadcast.8 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_0.7), dimensions={1,2} + ROOT add.1 = f32[200,200,200]{2,1,0} add(f32[200,200,200]{2,1,0} broadcast.10, f32[200,200,200]{2,1,0} broadcast.8) + } + + ENTRY entry (parameter.1: f32[200,200], parameter.2: f32[200,200]) -> f32[200,200] { + parameter.2 = f32[200,200]{1,0} parameter(1) + parameter.1 = f32[200,200]{1,0} parameter(0) + fusion.3 = f32[200,200,200]{2,1,0} fusion(f32[200,200]{1,0} parameter.2, f32[200,200]{1,0} parameter.1), kind=kLoop, calls=fused_computation.3 + constant.11 = f32[] constant(-inf) + reduce.16 = f32[200,200]{1,0} reduce(f32[200,200,200]{2,1,0} fusion.3, f32[] constant.11), dimensions={1}, to_apply=max + ROOT fusion.1 = f32[200,200]{1,0} fusion(f32[200,200,200]{2,1,0} fusion.3, f32[200,200]{1,0} reduce.16), kind=kInput, calls=fused_computation.1 
+ })") + .ValueOrDie(); + EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Fusion(op::Fusion(), op::Parameter(), op::Parameter())); +} + TEST_F(FusionMergerTest, WillNotMergeReduceUnfriendlyLayouts) { auto module = ParseAndReturnVerifiedModule(R"( HloModule m @@ -398,6 +446,29 @@ TEST_F(FusionMergerTest, WillMergeExpensiveFusionsIfSavesMemory) { EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); } +TEST_F(FusionMergerTest, WillMergeExpensiveFusionsWithSingleConsumer) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule m + + %f_b (p: f32[1024,1024,1024]) -> f32[1024,1024,1024] { + %p = f32[1024,1024,1024] parameter(0) + ROOT %t = f32[1024,1024,1024] tanh(%p) + } + + %f_c (p: f32[1024,1024,1024]) -> f32[1024,1024,1024] { + %p = f32[1024,1024,1024] parameter(0) + ROOT %t = f32[1024,1024,1024] add(%p, %p) + } + + ENTRY entry { + p0 = f32[1024,1024,1024] parameter(0) + f1 = f32[1024,1024,1024] fusion(p0), kind=kLoop, calls=%f_b + ROOT f2 = f32[1024,1024,1024] fusion(f1), kind=kLoop, calls=%f_c + })") + .ValueOrDie(); + EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index f2d29b5d11f..cc4de2c1099 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -29,12 +29,15 @@ limitations under the License. #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/all_reduce_combiner.h" #include "tensorflow/compiler/xla/service/batchnorm_expander.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" #include "tensorflow/compiler/xla/service/conditional_canonicalizer.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include "tensorflow/compiler/xla/service/convolution_4d_expander.h" @@ -43,6 +46,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" #include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/gpu/alias_passthrough_params.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" @@ -56,6 +60,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h" #include "tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h" #include "tensorflow/compiler/xla/service/gpu/horizontal_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" @@ -79,7 +84,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" @@ -139,6 +143,9 @@ Status GpuCompiler::OptimizeHloModule( pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); + // Comparison total order expander + pipeline.AddPass(); + // Remove zero-sized HLO from the input so that other passes don't have to // handle it. pipeline.AddPass(); @@ -190,11 +197,12 @@ Status GpuCompiler::OptimizeHloModule( /*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - pipeline.AddPass(); - // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. - pipeline.AddPass(); + pass.AddPass(); + + pass.AddPass(GatherExpander::kEliminateSimpleGathers); + pass.AddPass(ScatterExpander::kEliminateSimpleScatters); AlgebraicSimplifierOptions options; // When transposes appear in a fusion node, we can easily adjust the @@ -295,11 +303,13 @@ Status GpuCompiler::OptimizeHloModule( HloPassPipeline horizontal_fusion("horizontal_fusion"); horizontal_fusion.AddPass(); + horizontal_fusion.AddPass(); horizontal_fusion.AddPass(/*is_layout_sensitive=*/true, /*only_fusion_computations=*/true); horizontal_fusion.AddPass(); TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status()); } + { HloPassPipeline pipeline("all_reduce_combiner"); pipeline.AddPass( @@ -476,7 +486,8 @@ static Status CompileModuleToLlvmIrImpl( int pointer_size, const HloProfileIndexMap* profile_index_map, std::unique_ptr* llvm_module, std::unique_ptr* buffer_assignment, - std::unique_ptr* thunk_schedule) { + std::unique_ptr* thunk_schedule, + std::vector* constants) { *llvm_module = absl::make_unique("", *llvm_context); (*llvm_module)->setTargetTriple(target_triple); @@ -509,15 +520,19 @@ static Status CompileModuleToLlvmIrImpl( DumpHloModuleIfEnabled(*hlo_module, **buffer_assignment, "after_optimizations"); + mlir::MLIRContext mlir_context; + IrEmitterContext ir_emitter_context( hlo_module, buffer_assignment->get(), platform_name, gpu_device_info, - cuda_compute_capability, profile_index_map, llvm_module->get()); + cuda_compute_capability, profile_index_map, &mlir_context, + llvm_module->get()); HloComputation* entry_computation = hlo_module->entry_computation(); - IrEmitterUnnested ir_emitter(hlo_module->config(), entry_computation, - &ir_emitter_context); - TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); + TF_ASSIGN_OR_RETURN( + auto ir_emitter, + IrEmitterUnnested::Create(hlo_module->config(), entry_computation, + &ir_emitter_context)); { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission"); @@ -526,9 +541,10 @@ static Status CompileModuleToLlvmIrImpl( ThunkSequence thunk_sequence; absl::Span order = hlo_schedule->ThunkLaunchOrder(); for (HloInstruction* instruction : order) { - TF_RETURN_IF_ERROR(instruction->Visit(&ir_emitter)); - TF_RETURN_IF_ERROR(ir_emitter.Postprocess(instruction)); - std::unique_ptr thunks = ir_emitter.ConsumeThunkSequence(); + TF_RETURN_IF_ERROR(instruction->Visit(ir_emitter.get())); + TF_RETURN_IF_ERROR(ir_emitter->Postprocess(instruction)); + std::unique_ptr thunks = + ir_emitter->ConsumeThunkSequence(); // The invariants between each input 
HloInstruction* and output Thunk* are // not all explicitly checked, but at least we can document them here: @@ -566,6 +582,10 @@ static Status CompileModuleToLlvmIrImpl( *thunk_schedule = absl::make_unique( std::make_unique(std::move(thunk_sequence)), std::move(stream_assignment), std::move(thunk_to_hlo)); + + if (constants) { + *constants = std::move(ir_emitter_context.constants()); + } } return Status::OK(); @@ -631,12 +651,13 @@ StatusOr> GpuCompiler::RunBackend( std::unique_ptr llvm_module; std::unique_ptr buffer_assignment; std::unique_ptr thunk_schedule; + std::vector constants; TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl( module.get(), &llvm_context, target_triple_, data_layout_, stream_exec->platform()->Name(), gpu_device_info, cuda_compute_capability, GetCanShareBuffer(), pointer_size_, profile_index_map.get(), &llvm_module, - &buffer_assignment, &thunk_schedule)); + &buffer_assignment, &thunk_schedule, &constants)); if (user_pre_optimization_hook_) { user_pre_optimization_hook_(*llvm_module); @@ -682,7 +703,7 @@ StatusOr> GpuCompiler::RunBackend( backend_result.first, backend_result.second, gpu_version, std::move(thunk_schedule), std::move(module), std::move(buffer_assignment), std::move(profile_printer), - std::move(profile_index_map)); + std::move(profile_index_map), std::move(constants)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); @@ -716,7 +737,7 @@ StatusOr> CompileModuleToLlvmIr( hlo_module, llvm_context, target_triple, data_layout, platform_name, gpu_device_info, cuda_compute_capability, DummyCanShareBufferFunction, pointer_size, /*profile_index_map=*/nullptr, &llvm_module, - &buffer_assignment, &thunk_schedule)); + &buffer_assignment, &thunk_schedule, nullptr)); return llvm_module; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc index 5fa102ac785..94f9a96c0fe 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc @@ -313,7 +313,11 @@ bool GpuConvPaddingLegalization::CanonicalizeBackwardInputConvolution( new_backward_conv_window.mutable_dimensions(i)); } // Decreasing the padding by X *increases* the size of our output by X. - int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + // Note that we have swapped input spatial dimensions with output spatial + // dimensions to be compatible with the cuDNN API, so + // input_spatial_dimensions(i) gives the i-th spatial dimension of the + // output. + int64 dim = backward_conv_dnums.input_spatial_dimensions(i); new_backward_conv_shape.set_dimensions( dim, new_backward_conv_shape.dimensions(dim) + std::abs(padding_low - padding_high)); @@ -353,7 +357,11 @@ bool GpuConvPaddingLegalization::CanonicalizeBackwardInputConvolution( for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); - int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + // Note that we have swapped input spatial dimensions with output spatial + // dimensions to be compatible with the cuDNN API, so + // input_spatial_dimensions(i) gives the i-th spatial dimension of the + // output. 
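// Illustrative walk-through, derived from the BackwardInputConvolve test added
// below (not wording from the original patch): there the window has
// pad=0_0x0_1, so padding_low (0) and padding_high (1) differ by one in the
// second spatial dimension; the canonicalized backward convolution therefore
// produces f64[2,2,4,5], and a trailing slice restores the original
// f64[2,2,4,4] result shape.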
+ int64 dim = backward_conv_dnums.input_spatial_dimensions(i); if (padding_low > padding_high) { // If the amount of low padding (of the old backward convolution) is // larger, we internally pad the low end of the activations and slice diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc new file mode 100644 index 00000000000..c214486e18f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.h" + +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; +using ::testing::_; + +using GpuConvPaddingLegalizationTest = HloTestBase; + +TEST_F(GpuConvPaddingLegalizationTest, BackwardInputConvolve) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule convolution_module +ENTRY %convolution (operand f64[2,2,2,3]{3,2,1,0}) -> (f64[2,2,4,4]{3,2,1,0}, u8[0]) { + %operand = f64[2,2,2,3]{3,2,1,0} parameter(0) + %kernel = f64[2,3,2,3]{3,2,1,0} constant( + { + { /*i0=0*/ + { /*i1=0*/ + { 0.29629629629629628, 0.30246913580246915, 0.30864197530864196 }, + { 0.31481481481481483, 0.32098765432098764, 0.3271604938271605 } + }, + { /*i1=1*/ + { 0.25925925925925924, 0.26543209876543211, 0.27160493827160492 }, + { 0.27777777777777779, 0.2839506172839506, 0.29012345679012347 } + }, + { /*i1=2*/ + { 0.22222222222222221, 0.22839506172839505, 0.23456790123456789 }, + { 0.24074074074074073, 0.24691358024691357, 0.25308641975308643 } + } + }, + { /*i0=1*/ + { /*i1=0*/ + { 0.18518518518518517, 0.19135802469135801, 0.19753086419753085 }, + { 0.20370370370370369, 0.20987654320987653, 0.21604938271604937 } + }, + { /*i1=1*/ + { 0.14814814814814814, 0.15432098765432098, 0.16049382716049382 }, + { 0.16666666666666666, 0.1728395061728395, 0.17901234567901234 } + }, + { /*i2=2*/ + { 0.1111111111111111, 0.11728395061728394, 0.12345679012345678 }, + { 0.12962962962962962, 0.13580246913580246, 0.1419753086419753 } + } + } + }) + %reverse = f64[2,3,2,3]{3,2,1,0} reverse(%kernel), dimensions={0,1} + ROOT %custom-call = (f64[2,2,4,4]{3,2,1,0}, u8[0]{0}) custom-call(f64[2,2,2,3]{3,2,1,0} %operand, f64[2,3,2,3]{3,2,1,0} %reverse), 
window={size=2x3 stride=2x2 pad=0_0x0_1}, dim_labels=bf01_01io->b01f, custom_call_target="__cudnn$convBackwardInput", backend_config="{\"algorithm\":\"0\",\"tensor_ops_enabled\":false,\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_scale\":0}" +} + )") + .ValueOrDie(); + ASSERT_TRUE(GpuConvPaddingLegalization().Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Tuple(op::Slice(op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget, _, + op::Reverse(op::Constant())), + 0)), + op::GetTupleElement())); + auto slice = root->operand(0); + Shape expected_slice_shape = ShapeUtil::MakeShape(F64, {2, 2, 4, 4}); + EXPECT_TRUE(ShapeUtil::Equal(slice->shape(), expected_slice_shape)); + auto conv = slice->operand(0); + Shape expected_conv_shape = ShapeUtil::MakeShape(F64, {2, 2, 4, 5}); + EXPECT_TRUE(ShapeUtil::Equal(conv->shape(), expected_conv_shape)); +} + +} // anonymous namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 469f2919fba..c963dfb2b2a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -60,14 +60,16 @@ GpuExecutable::GpuExecutable( std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map) + std::unique_ptr hlo_profile_index_map, + std::vector globals) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), text_(text), binary_(binary), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), - assignment_(std::move(assignment)) { + assignment_(std::move(assignment)), + constants_(std::move(globals)) { CHECK(has_module() && assignment_); GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(), assignment_); @@ -280,28 +282,23 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) { se::ModuleHandle module_handle; TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle)); - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - const BufferAllocation& allocation = assignment_->GetAllocation(i); - if (allocation.is_constant()) { - TF_ASSIGN_OR_RETURN( - se::DeviceMemoryBase global, - executor->GetUntypedSymbol( - llvm_ir::ConstantBufferAllocationToGlobalName(allocation), - module_handle)); - VLOG(3) << "Resolved global " - << llvm_ir::ConstantBufferAllocationToGlobalName(allocation) - << " to " << global.opaque(); - InsertOrDie(&globals, i, global); + for (const auto& info : constants_) { + const Literal& literal = info.content; - const Literal& literal = - llvm_ir::LiteralForConstantAllocation(allocation); - CHECK(literal.shape().IsArray()); - if (!ShouldEmitLiteralInLlvmIr(literal)) { - VLOG(3) << "H2D memcpy for constant with shape " - << ShapeUtil::HumanString(literal.shape()); - stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size()); - } + TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol( + info.symbol_name, module_handle)); + VLOG(3) << "Resolved global " << info.symbol_name << " to " + << global.opaque(); + + CHECK(literal.shape().IsArray()); + if (!ShouldEmitLiteralInLlvmIr(literal)) { + VLOG(3) << "H2D memcpy for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + stream->ThenMemcpy(&global, literal.untyped_data(), 
literal.size_bytes()); + } + + if (info.allocation_index != -1) { + InsertOrDie(&globals, info.allocation_index, global); } } @@ -334,7 +331,11 @@ StatusOr GpuExecutable::BufferForAllocation( } return registered_buffer; } else if (allocation.is_constant()) { - return FindOrDie(*globals, arg_idx); + auto it = globals->find(arg_idx); + if (it == globals->end()) { + return se::DeviceMemoryBase(); + } + return it->second; } else { // Allocate each allocation that might escape, or is the temp buffer. CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()); @@ -480,6 +481,12 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( ExecutionInput& input = arguments[alias->parameter_number]; MaybeOwningDeviceMemory* maybe_owning_memory = input.MutableBuffer(alias->parameter_index); + if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) { + return InvalidArgument( + "An input was configured to be must-alias at " + "compile time but not donated at runtime: %s", + alias->ToString()); + } if (absl::optional owning = maybe_owning_memory->Release()) { // If the caller passes the ownership of the device memory, reuse it diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 516fa9b269a..613880fd44b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -49,6 +49,12 @@ namespace gpu { // This is an immutable data type after initialization, and thus thread safe. class GpuExecutable : public Executable { public: + struct ConstantInfo { + std::string symbol_name; + xla::Literal content; + int allocation_index = -1; + }; + // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. @@ -58,7 +64,8 @@ class GpuExecutable : public Executable { std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map); + std::unique_ptr hlo_profile_index_map, + std::vector constants); ~GpuExecutable() override; int64 SizeOfGeneratedCodeInBytes() const override; @@ -169,6 +176,8 @@ class GpuExecutable : public Executable { std::map module_globals_ TF_GUARDED_BY(module_handle_mutex_); + std::vector constants_; + TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index bb4184ff76f..e56fe4dd74b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -143,29 +143,27 @@ bool IsInputFusibleReduction(const HloInstruction& instr) { IsReductionFromOrToContiguousDimensions(instr); } +const HloInstruction* GetRealHeroForMultiOutputFusion( + const HloInstruction& instr) { + if (instr.opcode() != HloOpcode::kFusion) { + return &instr; + } + auto fused_expression_root = instr.fused_expression_root(); + if (!instr.IsMultiOutputFusion()) { + return fused_expression_root; + } + // If possible, we want to pick a reduction-from-or-to-contiguous-dims + // operand of the fusion root, because it has the most constraints. 
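// Hypothetical example (not taken from this change): for a multi-output
// fusion whose root is tuple(reduce(exp(p0), c), exp(p0)), the reduce operand
// is returned as the hero, since the reduction emitter imposes the strictest
// shape and layout constraints; if no reduction operand exists, we fall back
// to the first root operand after the loop below.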
+ for (const auto* inst : fused_expression_root->operands()) { + if (IsReductionFromOrToContiguousDimensions(*inst)) { + return inst; + } + } + return fused_expression_root->operands()[0]; +} + bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, const HloInstruction& instr2) { - // Returns the instructions that determines the emitter used for lowering, - // sometimes referred to as "the real hero". - auto get_real_hero = - [&](const HloInstruction* instr) -> const HloInstruction* { - if (instr->opcode() != HloOpcode::kFusion) { - return instr; - } - auto fused_expression_root = instr->fused_expression_root(); - if (!instr->IsMultiOutputFusion()) { - return fused_expression_root; - } - // If possible, we want to pick a reduction-to-vector operand of the - // fusion root, because it has the most constraints. - for (const auto* inst : fused_expression_root->operands()) { - if (IsReductionFromOrToContiguousDimensions(*inst)) { - return inst; - } - } - return fused_expression_root->operands()[0]; - }; - // Multi-output fusion kernels share a common parallel loop. The loop // dimensions are determined by instruction shapes. auto get_loop_shape = [&](const HloInstruction* element_instr) { @@ -181,8 +179,8 @@ bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, // root ops should have equal output shapes. An exception are // reduction-to-vector ops. Here the input shapes of the reduction (first // operand shape) and the reduction dimensions need to match. - auto* instr_1 = get_real_hero(&instr1); - auto* instr_2 = get_real_hero(&instr2); + auto* instr_1 = GetRealHeroForMultiOutputFusion(instr1); + auto* instr_2 = GetRealHeroForMultiOutputFusion(instr2); if (IsReductionFromOrToContiguousDimensions(*instr_1) && IsReductionFromOrToContiguousDimensions(*instr_2) && !AreFusedReductionOutputsConsistent({instr_1, instr_2}, instr_1)) { @@ -347,8 +345,13 @@ static int64 SharedMemoryUsage(const HloInstruction& instr) { // This limit is also often good for performance. In a fusion with many // operands, each GPU thread likely has to do a lot of work, and so possibly // uses a lot of registers, thus limiting occupancy. +// +// If the fusion is a producer/consumer fusion and instr1 is the +// consumer and instr2 is the producer, set is_consumer_producer_fusion +// to true to enable more fusion. bool FusionWouldBeTooLarge(const HloInstruction& instr1, - const HloInstruction& instr2) { + const HloInstruction& instr2, + bool is_consumer_producer_fusion) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() @@ -404,6 +407,17 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, // producer -> consumer relationship. operands.erase(&instr1); operands.erase(&instr2); + + // If the fused instruction would have no more inputs and outputs than + // before, it won't be bigger after fusion, so accept the fusion. Since + // this is a producer/consumer fusion, the number of consumer outputs does + // not change, so only the operand count needs to be checked here. + if (is_consumer_producer_fusion && + operands.size() <= instr1.operands().size()) { + return false; + } + + // Does the new fusion have more operands and outputs than the max?
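// Hedged illustration (hypothetical instructions, not part of this change):
// for a producer p = exp(a) feeding a consumer c = add(p, a), the fused node
// reads only {a}, so operands.size() (1) <= instr1.operands().size() (2) and
// the producer/consumer early-accept above already returned false; otherwise
// we fall through to the operand/output budget check below.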
return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion; } @@ -490,5 +504,24 @@ HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& /*producer*/, : HloInstruction::FusionKind::kLoop; } +bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, + const HloInstruction& consumer) { + return absl::c_all_of(instr.users(), [&](const HloInstruction* user) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + // Skip GTE. + return IsConsumerTheOnlyNonRootUser(*user, consumer); + } + if (user == &consumer) { + // `user` is `consumer`. + return true; + } + if (user == user->parent()->root_instruction()) { + // Consumed by ROOT. + return true; + } + return false; + }); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index e2a42ecb0a3..5296b8b4096 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -64,14 +64,23 @@ bool IsInputFusibleScatter(const HloInstruction& instr); // Determines whether the combination of `instr1` and `instr2` into a (possibly // multi-output) fusion would be "too large" -- i.e., have more operands and // outputs than is allowed or occupy too much shared memory. +// If the fusion is a producer/consumer fusion and instr1 is the +// consumer and instr2 is the producer, set consumer_producer_fusion +// to true to enable more fusion. bool FusionWouldBeTooLarge(const HloInstruction& instr1, - const HloInstruction& instr2); + const HloInstruction& instr2, + bool is_consumer_producer_fusion = false); // Check if fusing producer and consumer will generate a nested loop, e.g. both // producer and consumer are `reduce-window` HLO instructions. bool CreatesNestedLoop(const HloInstruction& producer, const HloInstruction& consumer); +// Returns the instruction that determines the emitter used for lowering, +// sometimes referred to as "the real hero". +const HloInstruction* GetRealHeroForMultiOutputFusion( + const HloInstruction& instr); + // Whether instruction shapes are compatible for multi-output fusion, i.e. // whether the emitters support lowering the resulting fusion. // This function works for both, sibling and producer-consumer multi-output @@ -101,6 +110,10 @@ bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr); HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& producer, const HloInstruction& consumer); +// Returns whether `consumer` is the only non-root user of `instr`. +bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, + const HloInstruction& consumer); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc index 6287f1e3ca2..31f011fa734 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc @@ -23,26 +23,11 @@ limitations under the License. namespace xla { -StatusOr GpuScatterExpander::Run(HloModule* module) { - auto is_nontrivial_scatter = [](HloInstruction* inst) { - // TODO(b/129698548): Scattering elements larger than 64 bits is not - // supported by XLA:GPU. 
- return inst->opcode() == HloOpcode::kScatter && - inst->shape().element_type() == C128; - }; - - std::vector scatter_instrs; - for (HloComputation* computation : module->MakeNonfusionComputations()) { - absl::c_copy_if(computation->instructions(), - std::back_inserter(scatter_instrs), is_nontrivial_scatter); - } - - for (HloInstruction* inst : scatter_instrs) { - TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(inst)); - TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root)); - } - - return !scatter_instrs.empty(); +bool GpuScatterExpander::InstructionMatchesPattern(HloInstruction* inst) { + // TODO(b/129698548): Scattering elements larger than 64 bits is not + // supported by XLA:GPU. + return inst->opcode() == HloOpcode::kScatter && + primitive_util::BitWidth(inst->shape().element_type()) > 64; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h index 0818b32474f..92acb909729 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h @@ -20,10 +20,17 @@ limitations under the License. namespace xla { +// Legalizes scatters on the GPU. class GpuScatterExpander : public ScatterExpander { public: + // Although we pass kEliminateAllScatters, we override this behavior in + // InstructionMatchesPattern and select only some scatters to expand. + GpuScatterExpander() : ScatterExpander(kEliminateAllScatters) {} + absl::string_view name() const override { return "gpu_scatter_expander"; } - StatusOr Run(HloModule* module) override; + + protected: + bool InstructionMatchesPattern(HloInstruction* inst) override; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 5d38d1b727c..26a22005dae 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -83,6 +83,8 @@ void HloToIrBindings::EmitBasePointersForHlos( if (non_io_hlo->opcode() == HloOpcode::kConstant) { llvm::Value* global_for_constant = module_->getGlobalVariable( llvm_ir::ConstantHloToGlobalName(*non_io_hlo)); + CHECK(global_for_constant) + << llvm_ir::ConstantHloToGlobalName(*non_io_hlo); BindHloToIrValue(*non_io_hlo, global_for_constant); } else { llvm::Type* pointee_type = @@ -117,11 +119,11 @@ static bool HasMeaningfulName(llvm::Value* value) { return false; } -llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, - ShapeIndexView shape_index, - llvm::Value* ir_value) { - llvm::Type* pointee_type = llvm_ir::ShapeToIrType( - ShapeUtil::GetSubshape(hlo.shape(), shape_index), module_); +llvm::Value* CastToTypedValue(const Shape& shape, llvm::Value* ir_value, + llvm::IRBuilder<>* b) { + llvm::Type* pointee_type = + llvm_ir::ShapeToIrType(shape, b->GetInsertBlock()->getModule()); + llvm::Type* dest_type = pointee_type->getPointerTo(); llvm::Value* typed_ir_value; @@ -129,9 +131,17 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, typed_ir_value = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( llvm::cast(ir_value), dest_type); } else { - typed_ir_value = b_->CreatePointerBitCastOrAddrSpaceCast( + typed_ir_value = b->CreatePointerBitCastOrAddrSpaceCast( ir_value, pointee_type->getPointerTo()); } + return typed_ir_value; +} + +llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
+ ShapeIndexView shape_index, + llvm::Value* ir_value) { + auto typed_ir_value = CastToTypedValue( + ShapeUtil::GetSubshape(hlo.shape(), shape_index), ir_value, b_); if (!HasMeaningfulName(ir_value)) { ir_value->setName(llvm_ir::IrName(&hlo, "raw")); } diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h index 5eef6727801..3813ec6c949 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h @@ -116,6 +116,10 @@ class HloToIrBindings { llvm::Value* temp_buffer_base_ = nullptr; }; +// Converts `ir_value` with type i8* to a typed LLVM Value* based on `shape`. +llvm::Value* CastToTypedValue(const Shape& shape, llvm::Value* ir_value, + llvm::IRBuilder<>* b); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc index 6d663c66b50..d11d1659d51 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/util/env_var.h" @@ -137,25 +138,6 @@ bool IsFusionSupported(const HloInstruction& instr) { return true; } -bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, - const HloInstruction& consumer) { - return absl::c_all_of(instr.users(), [&](const HloInstruction* user) { - if (user->opcode() == HloOpcode::kGetTupleElement) { - // Skip GTE. - return IsConsumerTheOnlyNonRootUser(*user, consumer); - } else if (user == &consumer) { - // `user` is `consumer`. - return true; - } else if (user == user->parent()->root_instruction()) { - // Consumed by ROOT is always fine, since it is impossible to create - // cycles through ROOT. - return true; - } else { - return false; - } - }); -} - // Returns whether `instr` is a profitable candidate to be horizontally fused. // Since the primary benefit of horizontal fusion comes from reducing the // kernel launch overhead, we want to exclude the instructions with diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc new file mode 100644 index 00000000000..58ed9f18840 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc @@ -0,0 +1,168 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" + +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/core/platform/errors.h" + +namespace xla { +namespace gpu { + +namespace { + +// Gets the representative input shape of the multi-output fusion. +Shape GetInputShapeForMultiOutputFusion(const HloInstruction& instr) { + // Get the HLO that determines the emitter used for lowering. + const HloInstruction* real_hero = GetRealHeroForMultiOutputFusion(instr); + if (real_hero->operands().empty()) { + // Simply return an empty shape if the representative node has no input + // operands. + return Shape(); + } else { + return real_hero->operand(0)->shape(); + } +} + +class HorizontalInputFusionImpl { + public: + explicit HorizontalInputFusionImpl(HloComputation* computation) + : computation_(computation) {} + + ~HorizontalInputFusionImpl() {} + + StatusOr Run(); + + private: + HloComputation* computation_; +}; // HorizontalInputFusionImpl + +// Compares one-by-one the dimensions of `shape_a` and `shape_b` from left to +// right. +bool CompareShapeDimsFromLeftToRight(const Shape& shape_a, + const Shape& shape_b) { + if (shape_a.rank() != shape_b.rank()) { + return shape_a.rank() < shape_b.rank(); + } + auto dims_a = shape_a.dimensions(); + auto dims_b = shape_b.dimensions(); + for (size_t i = 0; i < dims_a.size(); ++i) { + if (dims_a[i] != dims_b[i]) { + return dims_a[i] < dims_b[i]; + } + } + return true; +} + +std::vector FindAndSortFusionCandidates( + HloInstruction* consumer) { + absl::flat_hash_set fusion_instr_set; + for (auto opnd : consumer->operands()) { + HloInstruction* predecessor = opnd->LatestNonGteAncestor(); + // Find out the input fusion instructions whose only consumer is `consumer`. + // This guarantees that fusing these candidates will never create cycles, as + // there is no back edge. + if (IsReduceInputFusion(*predecessor) && + IsConsumerTheOnlyNonRootUser(*predecessor, *consumer)) { + fusion_instr_set.insert(predecessor); + } + } + + std::vector fusion_instrs; + fusion_instrs.insert(fusion_instrs.end(), fusion_instr_set.begin(), + fusion_instr_set.end()); + + std::sort(fusion_instrs.begin(), fusion_instrs.end(), + [&](const HloInstruction* a, const HloInstruction* b) { + Shape shape_a = GetInputShapeForMultiOutputFusion(*a); + Shape shape_b = GetInputShapeForMultiOutputFusion(*b); + if (!ShapeUtil::EqualIgnoringElementType(shape_a, shape_b)) { + // Sort shapes according to dimensions, so that the same input + // shapes will be placed adjacent each other. + return CompareShapeDimsFromLeftToRight(shape_a, shape_b); + } + // Sort `fusion_instrs` according to instruction counts, because + // we'd like to fuse together computations of similar sizes. + return a->fused_instruction_count() < + b->fused_instruction_count(); + }); + + return fusion_instrs; +} + +StatusOr HorizontalInputFusionImpl::Run() { + bool changed = false; + XLA_VLOG_LINES(3, computation_->ToString()); + + // Using def-to-use order is sound since we do not modify users. 
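// Sketch of the intended effect, taken from the BasicTest HLO added later in
// this change: two kInput reductions whose only consumer is the ROOT tuple,
//   fusion.1 = f16[] fusion(arg.1), kind=kInput, calls=fused_computation.1
//   fusion.2 = f16[] fusion(arg.2), kind=kInput, calls=fused_computation.2
//   ROOT tuple.1 = (f16[], f16[]) tuple(fusion.1, fusion.2)
// are merged by the loop below into a single multi-output kInput fusion.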
+ std::vector def_to_use_order = + computation_->MakeInstructionPostOrder(); + for (size_t i = 0; i < def_to_use_order.size(); ++i) { + auto consumer = def_to_use_order[i]; + auto candidates = FindAndSortFusionCandidates(consumer); + if (candidates.empty()) { + continue; + } + + size_t fusion_anchor_id = 0; + for (size_t j = 1; j < candidates.size(); ++j) { + HloInstruction* fusion_anchor = candidates[fusion_anchor_id]; + HloInstruction* fused = candidates[j]; + if (ShapesCompatibleForMultiOutputFusion(*fusion_anchor, *fused) && + !FusionWouldBeTooLarge(*fusion_anchor, *fused)) { + VLOG(3) << "Fuse " << fused->ToString() << " into " + << fusion_anchor->ToString(); + fusion_anchor->MergeFusionInstructionIntoMultiOutput(fused); + changed = true; + } else { + // Update the `fusion_anchor_id` since `fused` is either not + // compatible or not beneficial to be fused with current fusion anchor. + VLOG(3) << j - fusion_anchor_id - 1 << " instructions are fused."; + fusion_anchor_id = j; + } + } + } + + return changed; +} + +} // namespace + +StatusOr GpuHorizontalInputFusion::RunOnComputation( + HloComputation* computation) { + HorizontalInputFusionImpl horizontal_fusion_impl(computation); + return horizontal_fusion_impl.Run(); +} + +StatusOr GpuHorizontalInputFusion::Run(HloModule* module) { + bool changed = false; + VLOG(2) << "Run horizontal input fusion."; + for (auto* comp : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(changed, RunOnComputation(comp)); + } + + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h new file mode 100644 index 00000000000..85313d03412 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/platform/macros.h" + +namespace xla { +namespace gpu { + +// This optimization pass horizontally fuses kInput fusions to both reduce the +// kernel launch overhead and increase parallelism degree. See +// GpuHorizontalFusion for general description and motivation about horizontal +// fusion. GpuHorizontalFusion deals with kLoop fusions while this pass deals +// with kInput fusions. +// +// Following GpuHorizontalFusion, a simple yet effective heuristic is used +// to search the fusion candidates while avoiding creating cycles. 
That is, +// we simply search for fusion candidates by looking for instructions whose +// outputs are all consumed by the same instruction. This catches the typical +// target cases; often, the candidate instructions are just consumed by the +// ROOT tuple of the entry computation. +class GpuHorizontalInputFusion : public HloModulePass { + public: + GpuHorizontalInputFusion() {} + + absl::string_view name() const override { + return "gpu_horizontal_input_fusion"; + } + + StatusOr Run(HloModule* module) override; + + private: + StatusOr RunOnComputation(HloComputation*); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc new file mode 100644 index 00000000000..88fdd3ec293 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc @@ -0,0 +1,146 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class HorizontalInputFusionTest : public GpuCodegenTest {}; + +TEST_F(HorizontalInputFusionTest, BasicTest) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule BasicTest + + %add_f16 { + %x = f16[] parameter(0) + %y = f16[] parameter(1) + ROOT %add = f16[] add(%x, %y) + } + + fused_computation.1 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + } + + fused_computation.2 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + } + + ENTRY entry_computation { + arg.1 = f16[1024]{0} parameter(0) + arg.2 = f16[1024]{0} parameter(1) + fusion.1 = f16[] fusion(arg.1), kind=kInput, calls=fused_computation.1 + fusion.2 = f16[] fusion(arg.2), kind=kInput, calls=fused_computation.2 + ROOT tuple.1 = (f16[], f16[]) tuple(fusion.1, fusion.2) + } +)") + .ValueOrDie(); + + EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie()); + + const HloInstruction* entry_root = + module->entry_computation()->root_instruction(); + EXPECT_THAT(entry_root, op::Tuple((op::GetTupleElement(op::Fusion())), + 
(op::GetTupleElement(op::Fusion())))); + + const HloInstruction* fusion = entry_root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce())); +} + +TEST_F(HorizontalInputFusionTest, ManyInputFusions) { + auto module = CreateNewVerifiedModule(); + + HloComputation* reduce_computation; + { + auto embedded_builder = HloComputation::Builder("add"); + auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "lhs")); + auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "rhs")); + embedded_builder.AddInstruction( + HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs)); + reduce_computation = + module->AddEmbeddedComputation(embedded_builder.Build()); + } + + HloComputation::Builder builder(TestName()); + std::vector var_outs; + auto input_shape = ShapeUtil::MakeShape(F32, {1024, 1024}); + auto output_shape = ShapeUtil::MakeShape(F32, {1024}); + for (int64 i = 0; i < 130; ++i) { + // %fused_computation.3 (param_0: f32[1024,1024], param_1: f32[]) -> + // f32[1024] { + // %param_0 = f32[1024,1024]{1,0} parameter(0) + // %param_1 = f32[] parameter(1) + // %broadcast = f32[1024,1024]{1,0} broadcast(f32[] %param_1), + // dimensions={} + // %multiply = f32[1024,1024]{1,0} + // multiply(f32[1024,1024]{1,0} %param_0, f32[1024,1024]{1,0} + // %broadcast) + // %constant0 = f32[] constant(0) + // ROOT %reduce = f32[1024]{0} + // reduce(f32[1024,1024]{1,0} %multiply, f32[] %constant0), + // dimensions={1}, to_apply=%add + // } + HloInstruction* param_var_in = builder.AddInstruction( + HloInstruction::CreateParameter(i * 2 + 0, input_shape, "var.in")); + HloInstruction* param_alpha = + builder.AddInstruction(HloInstruction::CreateParameter( + i * 2 + 1, ShapeUtil::MakeShape(F32, {}), "alpha")); + auto alpha_broadcasted = builder.AddInstruction( + HloInstruction::CreateBroadcast(input_shape, param_alpha, {})); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + input_shape, HloOpcode::kMultiply, param_var_in, alpha_broadcasted)); + HloInstruction* const0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + auto reduce = builder.AddInstruction(HloInstruction::CreateReduce( + output_shape, mul, const0, {1}, reduce_computation)); + var_outs.push_back(reduce); + } + builder.AddInstruction(HloInstruction::CreateTuple(var_outs)); + module->AddEntryComputation(builder.Build()); + + // Verify that horizontal fusion is kicked in. Check that there are multiple + // `reduce` instructions fused into the same fusion. 6 is just a randomly + // picked number as we don't exactly know how large the fusion will be + // created due to the `FusionWouldBeTooLarge` constraint. + CompileAndVerifyIr(module->Clone(), R"(CHECK: reduce-group-6)", + /*match_optimized_ir=*/false); + + // Testing with the entire gpu optimization pipeline. 
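// RunAndCompare (inherited from HloTestBase) executes the module on the test
// backend and on the reference backend with generated inputs and checks that
// the outputs agree within the absolute/relative tolerances of the ErrorSpec
// used below.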
+ EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index b994ead17ca..b90e4d85f80 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -60,18 +60,22 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, // Output fusions are not currently supported on GPUs. if (producer->opcode() == HloOpcode::kFusion) { + VLOG(4) << "Producer " << producer->name() << " is a fusion op"; return false; } // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). - if (producer->opcode() != HloOpcode::kFusion && - consumer->ReusesOperandElements(operand_index) && - is_expensive(*producer)) { + if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && + ReusesOperandElements(consumer, operand_index)) { + VLOG(4) << "Do not fuse simple, expensive producer " << producer->name() + << " and consumer which reuses operand elements."; return false; } if (!IsProducerConsumerFusible(*producer, *consumer) || !InstructionFusion::ShouldFuse(consumer, operand_index)) { + VLOG(4) << "Producer " << producer->name() + << " is not fusible or should not be fused."; return false; } return true; @@ -87,7 +91,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. - if (FusionWouldBeTooLarge(*consumer, *producer)) { + if (FusionWouldBeTooLarge(*consumer, *producer, + /*is_consumer_producer_fusion=*/true)) { VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" << consumer->ToString() << ") would be too large"; return false; @@ -107,8 +112,12 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - return !fusion_node_evaluations_.at(consumer).AverageCodeDuplicationTooHigh( - producer); + if (fusion_node_evaluations_.at(consumer).CodeDuplicationTooHigh(producer)) { + VLOG(5) << "Fusion of " << producer->name() << " into " << consumer->name() + << " would result in overly large code duplication."; + return false; + } + return true; } bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer, diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc index 4dbd3196ae6..154612824ef 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc @@ -28,7 +28,7 @@ namespace mlir { namespace xla_thunks { XLAThunksDialect::XLAThunksDialect(MLIRContext *context) - : Dialect(getDialectNamespace(), context) { + : Dialect(getDialectNamespace(), context, TypeID::get()) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc.inc" diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td index 38602550864..eb203e6917d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td @@ -21,12 +21,6 @@ limitations under the License. 
include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/IR/OpBase.td" -class LLVMPointerTo - : ContainerType().isPointerTy()">, - "$_self.cast<::mlir::LLVM::LLVMType>().getPointerElementTy()", - "LLVM pointer">; - def XLAThunks_Dialect : Dialect { let name = "xla_thunks"; let cppNamespace = "xla_thunks"; @@ -45,12 +39,12 @@ def AllocationSlice : StructAttr<"AllocationSlice", XLAThunks_Dialect, [ def MemzeroThunkOp : ThunkOp<"execute_memzero_thunk"> { let arguments = (ins - LLVMPointerTo>:$execute_params, + LLVM_PointerTo:$execute_params, AllocationSlice:$allocation_slice ); let results = (outs I<1>:$ok, - LLVMPointerTo>:$error_message + LLVM_PointerTo:$error_message ); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 6309d7fcdee..9d4ec358bd3 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -433,7 +433,7 @@ llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, builder->CreateZExt( builder->CreateBitCast(value, builder->getIntNTy(bit_width)), builder->getIntNTy(32 * num_segments)), - llvm::VectorType::get(builder->getInt32Ty(), num_segments)); + llvm::VectorType::get(builder->getInt32Ty(), num_segments, false)); for (int i = 0; i < num_segments; ++i) { llvm::Value* insert_val; if (target_triple.isNVPTX()) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 31203b9c5f0..2215881271c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -30,12 +30,14 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" @@ -98,6 +100,64 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) { .MakeElementGenerator(hlo, operand_to_generator)); } +Status IrEmitter::EmitConstants(const HloComputation& computation, + bool lookup_indices) { + for (HloInstruction* instr : computation.instructions()) { + if (instr->opcode() != HloOpcode::kConstant) { + continue; + } + Literal& literal = *Cast(instr)->mutable_literal(); + const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal); + llvm::ArrayType* global_type = + llvm::ArrayType::get(b_.getInt8Ty(), literal.size_bytes()); + llvm::Constant* initializer = + should_emit_initializer + ? 
llvm_ir::ConvertLiteralToIrConstant(literal, module_) + : llvm::ConstantAggregateZero::get(global_type); + if (should_emit_initializer) { + VLOG(3) << "Emitted initializer for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + } + + // These globals will be looked up by name by GpuExecutable so we need to + // give them an external linkage. Not all of their uses are visible in + // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that + // merely preserves their names (like available_externally); we also need + // to ensure that they stick around even if they're "unused". + // + // We may have to be more clever here in the future if we notice that + // we're keeping around too many globals because of their linkage. + unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace( + *ir_emitter_context_->llvm_module()); + + std::string global_name = llvm_ir::ConstantHloToGlobalName(*instr); + + llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( + global_type, /*isConstant=*/should_emit_initializer, + llvm::GlobalValue::ExternalLinkage, + /*Initializer=*/initializer, global_name, + /*TLMode=*/llvm::GlobalValue::NotThreadLocal, + /*AddressSpace=*/global_address_space, + /*isExternallyInitialized=*/false); + global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes)); + ir_emitter_context_->llvm_module()->getGlobalList().push_back( + global_for_const); + + GpuExecutable::ConstantInfo info; + info.symbol_name = global_name; + info.content = literal.Clone(); + if (lookup_indices) { + auto maybe_slice = + ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, {}); + if (maybe_slice.ok()) { + info.allocation_index = maybe_slice.ValueOrDie().index(); + } + } + ir_emitter_context_->constants().push_back(std::move(info)); + } + return Status::OK(); +} + Status IrEmitter::HandleConstant(HloInstruction* constant) { return Status::OK(); } @@ -175,10 +235,12 @@ Status IrEmitter::EmitCallToNestedComputation( llvm::Function*& emitted_function = computation_to_ir_function_[&nested_computation]; if (emitted_function == nullptr) { - IrEmitterNested ir_emitter_nested(hlo_module_config_, nested_computation, - ir_emitter_context_); - TF_RETURN_IF_ERROR(ir_emitter_nested.CodegenNestedComputation()); - emitted_function = ir_emitter_nested.GetEmittedFunction(); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_nested, + IrEmitterNested::Create(hlo_module_config_, nested_computation, + ir_emitter_context_)); + TF_RETURN_IF_ERROR(ir_emitter_nested->CodegenNestedComputation()); + emitted_function = ir_emitter_nested->GetEmittedFunction(); } // Operands are in default address space for non-AMDGPU target. diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 50e9f06ef08..1a387528220 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -105,6 +105,12 @@ class IrEmitter : public DfsHloVisitorWithDefault, llvm::IRBuilder<>* builder() { return &b_; } + // Emits constants to the generated LLVM IR, and also populates related + // information to ir_emitter_context for large-constant initializations. If + // `lookup_indices` is true, the allocation index associated with the constant + // is also populated. + Status EmitConstants(const HloComputation& computation, bool lookup_indices); + protected: // Constructs an IrEmitter with the given IrEmitter context.
// ir_emitter_context is owned by the caller and should outlive the IrEmitter diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index 9c43f80dc60..34b93ca5b3f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -17,13 +17,19 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_CONTEXT_H_ #include "llvm/IR/Module.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" namespace xla { namespace gpu { + // IrEmitterContext encapsulates common (mutable and immutable) data structures // used by both IrEmitterNested and IrEmitterUnnested, such as the buffer // assignment and the name uniquer. @@ -34,14 +40,20 @@ class IrEmitterContext { const HloModule* hlo_module, const BufferAssignment* buffer_assignment, std::string platform_name, GpuDeviceInfo gpu_device_info, absl::optional cuda_compute_capability, - const HloProfileIndexMap* profile_index_map, llvm::Module* llvm_module) + const HloProfileIndexMap* profile_index_map, + mlir::MLIRContext* mlir_context, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), platform_name_(std::move(platform_name)), gpu_device_info_(gpu_device_info), cuda_compute_capability_(cuda_compute_capability), profile_index_map_(profile_index_map), - llvm_module_(llvm_module) {} + mlir_context_(mlir_context), + llvm_module_(llvm_module) { + mlir_context_ + ->loadDialect(); + } // Disallow copy and assign. 
IrEmitterContext(const IrEmitterContext&) = delete; IrEmitterContext& operator=(const IrEmitterContext&) = delete; @@ -57,9 +69,12 @@ class IrEmitterContext { return cuda_compute_capability_; } const HloProfileIndexMap* profile_index_map() { return profile_index_map_; } + mlir::MLIRContext* mlir_context() { return mlir_context_; } llvm::Module* llvm_module() { return llvm_module_; } NameUniquer* name_uniquer() { return &name_uniquer_; } + std::vector& constants() { return constants_; } + private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; @@ -67,8 +82,10 @@ class IrEmitterContext { GpuDeviceInfo gpu_device_info_; absl::optional cuda_compute_capability_; const HloProfileIndexMap* profile_index_map_; + mlir::MLIRContext* mlir_context_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; + std::vector constants_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc index e96c5f05e60..5fc091ed8e7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc @@ -41,6 +41,16 @@ IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config, : IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/true), nested_computation_(nested_computation) {} +StatusOr> IrEmitterNested::Create( + const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context) { + std::unique_ptr emitter(new IrEmitterNested( + hlo_module_config, nested_computation, ir_emitter_context)); + TF_RETURN_IF_ERROR(emitter->EmitConstants(nested_computation, false)); + return emitter; +} + // Nested function serves the same purpose on GPU as a thread-local function on // a CPU. Status IrEmitterNested::CodegenNestedComputation() { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h index ce825851bcc..8ed76cabcda 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h @@ -39,12 +39,11 @@ namespace gpu { // class IrEmitterNested : public IrEmitter { public: - // Constructs an LLVM IR emitter for a nested HLO computation. `function` is - // the containing IR function this emitter produces IR to. See - // IrEmitter::IrEmitter for the meanings of other arguments. - IrEmitterNested(const HloModuleConfig& hlo_module_config, - const HloComputation& nested_computation, - IrEmitterContext* ir_emitter_context); + static StatusOr> Create( + const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context); + IrEmitterNested(const IrEmitterNested&) = delete; IrEmitterNested& operator=(const IrEmitterNested&) = delete; @@ -62,6 +61,13 @@ class IrEmitterNested : public IrEmitter { Status CodegenNestedComputation(); private: + // Constructs an LLVM IR emitter for a nested HLO computation. `function` is + // the containing IR function this emitter produces IR to. See + // IrEmitter::IrEmitter for the meanings of other arguments. 
+ IrEmitterNested(const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context); + const HloComputation& nested_computation_; llvm::Function* emitted_function_; }; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 34cdfb4ecf0..f7627c348b6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/types/optional.h" #include "absl/types/span.h" #include "llvm/ADT/StringRef.h" @@ -36,6 +37,13 @@ limitations under the License. #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" +#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" @@ -82,6 +90,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -133,7 +142,7 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk, llvm::LLVMContext& llvm_context = llvm_module->getContext(); llvm::ConstantInt* threads_per_block_ir_value = llvm::ConstantInt::get( llvm::IntegerType::get(llvm_context, /*NumBits=*/32), - launch_dims.threads_per_block()); + launch_dims.thread_counts_per_block().x); // Our launch bounds are exact, so we can specify them as reqntidx rather than // maxntidx. 
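The IrEmitterNested/IrEmitterUnnested changes in this hunk make the constructors private and add static Create() factories returning StatusOr, because the extra initialization (EmitConstants, LHLO scratch-emitter setup) can fail and a constructor has no way to report that. A minimal sketch of the pattern, using a simplified variant-based StatusOr stand-in rather than the real xla::StatusOr.

#include <iostream>
#include <memory>
#include <string>
#include <variant>

// Simplified stand-in for StatusOr<T>: either a value or an error string.
template <typename T>
using StatusOr = std::variant<T, std::string>;

class Emitter {
 public:
  // Factory: runs fallible initialization before handing the object out.
  static StatusOr<std::unique_ptr<Emitter>> Create(bool emit_constants_ok) {
    std::unique_ptr<Emitter> emitter(new Emitter());
    if (!emit_constants_ok) {
      return std::string("EmitConstants failed");
    }
    return std::move(emitter);
  }

 private:
  Emitter() = default;  // construction only via Create()
};

int main() {
  auto result = Emitter::Create(/*emit_constants_ok=*/true);
  if (std::holds_alternative<std::string>(result)) {
    std::cerr << "error: " << std::get<std::string>(result) << "\n";
    return 1;
  }
  std::unique_ptr<Emitter> emitter =
      std::move(std::get<std::unique_ptr<Emitter>>(result));
  std::cout << "emitter created\n";
}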
nvvm_annotations_node->addOperand(llvm::MDNode::get( @@ -143,13 +152,85 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk, llvm::ConstantAsMetadata::get(threads_per_block_ir_value)})); } +int64_t GetAllocationIndex(mlir::BlockArgument func_arg) { + auto func_op = + mlir::cast(func_arg.getParentRegion()->getParentOp()); + return func_op + .getArgAttrOfType(func_arg.getArgNumber(), + "lmhlo.alloc") + .getValue() + .getSExtValue(); +} + +StatusOr GetAllocationSliceForMlir( + mlir::Value v, absl::Span allocations) { + int64 size = v.getType().cast().getSizeInBits() / 8; + + if (auto arg = v.dyn_cast()) { + return BufferAllocation::Slice(&allocations[GetAllocationIndex(arg)], 0, + size); + } + + // We match two patterns here: + // * v = ViewOp(arg); + // * v = StaticMemRefCastOp(ViewOp(arg)); + if (mlir::Operation* op = v.getDefiningOp()) { + if (auto cast = mlir::dyn_cast(op)) { + mlir::Value source = cast.getViewSource(); + op = source.getDefiningOp(); + if (!op) { + return Unimplemented("StaticMemRefCastOp has to wrap an op"); + } + } + if (auto view = mlir::dyn_cast(op)) { + return BufferAllocation::Slice( + &allocations[GetAllocationIndex( + view.source().cast())], + mlir::cast(view.byte_shift().getDefiningOp()) + .value() + .cast() + .getValue() + .getSExtValue(), + size); + } + return Unimplemented("StaticMemRefCastOp has to wrap a ViewOp"); + } + + return Unimplemented( + "Operand has to be in the form of ViewOp(arg) or " + "StaticMemRefCastOp(ViewOp(arg))"); +} + +absl::string_view GetHloName(mlir::Operation* op) { + if (auto attr = op->getAttrOfType("name")) { + auto ref = attr.getValue(); + return absl::string_view(ref.data(), ref.size()); + } + return ""; +} + } // namespace IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config, const HloComputation* hlo_computation, IrEmitterContext* ir_emitter_context) : IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/false), - hlo_computation_(hlo_computation) {} + hlo_computation_(hlo_computation), + mlir_scratch_module_(mlir::ModuleOp::create( + mlir::Builder(ir_emitter_context->mlir_context()).getUnknownLoc())), + lhlo_scratch_emitter_(ir_emitter_context_->buffer_assignment(), + *hlo_computation, mlir_scratch_module_.get()) {} + +StatusOr> IrEmitterUnnested::Create( + const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context) { + auto emitter = std::unique_ptr(new IrEmitterUnnested( + hlo_module_config, hlo_computation, ir_emitter_context)); + TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize()); + TF_RETURN_IF_ERROR(emitter->EmitConstants(*hlo_computation, true)); + return std::move(emitter); +} Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { bindings_.UnbindAllLocalIrValues(); @@ -157,12 +238,11 @@ Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { } llvm::Function* IrEmitterUnnested::BuildKernelPrototype( - const HloInstruction& inst, - absl::Span args) { + absl::string_view name, absl::Span args) { // Compute the kernel name. The opcode string may contain "-" which cannot be // in a PTX function name, so sanitize the name before uniquifying it. string kernel_name = ir_emitter_context_->name_uniquer()->GetUniqueName( - llvm_ir::SanitizeFunctionName(inst.name())); + llvm_ir::SanitizeFunctionName(std::string(name))); // Create the kernel and add it to the module. 
llvm::Module* module = ir_emitter_context_->llvm_module(); @@ -358,7 +438,8 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) { } Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) { - AddThunkToThunkSequence(BuildConditionalThunk(conditional)); + TF_ASSIGN_OR_RETURN(auto thunk, BuildConditionalThunk(conditional)); + AddThunkToThunkSequence(std::move(thunk)); return Status::OK(); } @@ -1037,10 +1118,13 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { // Build ForThunk for conformant while loops, otherwise build WhileThunk. auto config = xla_while->backend_config(); if (config.ok() && config.ValueOrDie().has_known_trip_count()) { - AddThunkToThunkSequence( + TF_ASSIGN_OR_RETURN( + auto thunk, BuildForThunk(xla_while, config.ValueOrDie().known_trip_count().n())); + AddThunkToThunkSequence(std::move(thunk)); } else { - AddThunkToThunkSequence(BuildWhileThunk(xla_while)); + TF_ASSIGN_OR_RETURN(auto thunk, BuildWhileThunk(xla_while)); + AddThunkToThunkSequence(std::move(thunk)); } return Status::OK(); } @@ -1263,37 +1347,110 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { return IrEmitter::HandleSelect(select); } +StatusOr +IrEmitterUnnested::GetOrCreateSubComputationFromRegion(mlir::Region* region) { + std::unique_ptr& module = scratch_nested_computations_[region]; + if (module == nullptr) { + xla::XlaComputation xla_computation; + TF_RETURN_IF_ERROR(ConvertRegionToComputation(region, &xla_computation)); + TF_ASSIGN_OR_RETURN(auto program_shape, xla_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN( + module, HloModule::CreateFromProto(xla_computation.proto(), + HloModuleConfig(program_shape))); + } + return module->entry_computation(); +} + Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { + MlirEmitterInput result; + + TF_ASSIGN_OR_RETURN(auto sort_op, lhlo_scratch_emitter_.EmitSortOp(sort)); + result.op = sort_op; + result.name = GetHloName(sort_op); + // The name in sort op has no semantics, and it's for debug only. If the name + // doesn't exist, we should use a namer (e.g. count-based). + // TODO(timshen): use a namer instead of relying on the HloInstruction names. + if (result.name.empty()) { + result.name = sort->name(); + } + const auto& buffer_assignment = ir_emitter_context_->buffer_assignment(); + auto& slice = result.extra_slice; + TF_ASSIGN_OR_RETURN(slice.buffer_slice, + buffer_assignment.GetUniqueSlice(sort, {})); + slice.written = true; + slice.shape = sort->shape(); + + result.thunk_info = GetThunkInfo(sort); + + return EmitSortFromMlir(result); +} + +Status IrEmitterUnnested::EmitSortFromMlir(MlirEmitterInput input) { + absl::Span allocations( + ir_emitter_context_->buffer_assignment().Allocations()); + auto sort_op = mlir::cast(input.op); + + int operand_count = sort_op.operands().size(); + std::vector operand_shapes(operand_count); + std::vector slices; + std::vector output_shapes(sort_op.output().size()); + + for (int i = 0; i < operand_count; i++) { + operand_shapes[i] = + TypeToShape(sort_op.operands()[i].getType().cast()); + } + + // Craft n + 1 slices, where the first n are output parameters, and the last + // is the on-device tuple storage. We don't need n operands because sorting + // kernels are always in-place. 
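For context on the sort lowering in this hunk: a little further down, the emitter derives the number of bitonic sorting stages from Log2Ceiling of the sort-dimension bound and checks that 2^num_stages covers the bound. A quick standalone check of that arithmetic; Log2Ceiling is reimplemented here for illustration in place of tensorflow::Log2Ceiling.

#include <cassert>
#include <cstdint>
#include <iostream>

// Smallest n such that 2^n >= x (for x >= 1), mirroring tensorflow::Log2Ceiling.
int64_t Log2Ceiling(uint64_t x) {
  int64_t n = 0;
  while ((uint64_t{1} << n) < x) ++n;
  return n;
}

int main() {
  for (uint64_t dimension_to_sort_bound : {1u, 2u, 3u, 1000u, 1024u, 1025u}) {
    int64_t num_stages = Log2Ceiling(dimension_to_sort_bound);
    // The invariants the emitter asserts before building the xor-mask passes.
    assert((uint64_t{1} << num_stages) >= dimension_to_sort_bound);
    assert(num_stages == 0 ||
           (uint64_t{1} << (num_stages - 1)) < dimension_to_sort_bound);
    std::cout << "bound=" << dimension_to_sort_bound
              << " -> num_stages=" << num_stages << "\n";
  }
}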
+ for (int i = 0; i < operand_count; i++) { + output_shapes[i] = + TypeToShape(sort_op.output()[i].getType().cast()); + MlirBufferSlice slice; + TF_ASSIGN_OR_RETURN( + slice.buffer_slice, + GetAllocationSliceForMlir(sort_op.output()[i], allocations)); + slice.written = true; + slice.shape = operand_shapes[i]; + slices.push_back(slice); + } + slices.push_back(input.extra_slice); + std::vector> thunks; - Shape keys_shape = sort->operand(0)->shape(); - int64 dimension_to_sort = sort->dimensions(0); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); + + Shape keys_shape = operand_shapes[0]; + int64 dimension_to_sort = sort_op.dimension(); + for (int64 i = 0; i < operand_count; ++i) { // We assume that the layout of all involved operands and outputs is the // same. - TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(keys_shape, - sort->operand(i)->shape())); - TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual( - keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index))); + TF_RET_CHECK( + LayoutUtil::LayoutsInShapesEqual(keys_shape, operand_shapes[i])); + TF_RET_CHECK( + LayoutUtil::LayoutsInShapesEqual(keys_shape, output_shapes[i])); // If possible, we share buffers. If that is not possible, we need to copy // the values, because the emitter does the sorting in-place. - auto destination_buffer = GetAllocationSlice(*sort, shape_index); - auto source_address = GetAllocationSlice(*sort->operand(i)); + TF_ASSIGN_OR_RETURN( + auto destination_buffer, + GetAllocationSliceForMlir(sort_op.output()[i], allocations)); + TF_ASSIGN_OR_RETURN( + auto source_address, + GetAllocationSliceForMlir(sort_op.operands()[i], allocations)); if (destination_buffer != source_address) { // TODO(b/26783907): Figure out why we never seem to share buffers for // key/value sort. + VLOG(2) << input.name << " requires initial D2D copy for operand " << i; thunks.push_back(absl::make_unique( Thunk::ThunkInfo(), /*source_address=*/source_address, /*destination_buffer=*/destination_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(i)->shape()))); + /*mem_size=*/ShapeUtil::ByteSizeOf(operand_shapes[i]))); } } uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); + VLOG(2) << input.name << " requires " << num_stages << " stages."; CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound); @@ -1357,10 +1514,10 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { // we have not enough threads, or not enough shared memory. Also it does not // give a speedup if the tile size is < 128. int64 total_shared_memory_needed = 0; - for (int64 i = 0; i < sort->operand_count(); ++i) { + for (int64 i = 0; i < operand_count; ++i) { total_shared_memory_needed += - kTileSize * ShapeUtil::ByteSizeOfPrimitiveType( - sort->operand(i)->shape().element_type()); + kTileSize * + ShapeUtil::ByteSizeOfPrimitiveType(operand_shapes[i].element_type()); } bool no_tiling = kTileSize < 128 || @@ -1368,34 +1525,51 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { ir_emitter_context_->gpu_device_info().threads_per_block_limit || total_shared_memory_needed > ir_emitter_context_->gpu_device_info().shared_memory_per_block; + VLOG(2) << absl::StreamFormat( + "%s %s use tiling. 
No tiling if any of the following is true: " + "kTileSize=%d < 128, " + "kThreadsPerBlock=%d > threads_per_block_limit=%d, " + "total_shared_memory_needed=%d > shared_memory_per_block=%d", + input.name, (no_tiling ? "won't" : "will"), kTileSize, kThreadsPerBlock, + ir_emitter_context_->gpu_device_info().threads_per_block_limit, + total_shared_memory_needed, + ir_emitter_context_->gpu_device_info().shared_memory_per_block); uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock); LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); + VLOG(2) << absl::StreamFormat("%s launch dims: %d blocks, %d threads/block", + input.name, num_blocks, kThreadsPerBlock); + std::vector ir_arrays; auto emit_kernel = [&](absl::Span xor_masks) { - thunks.push_back( - BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + VLOG(2) << absl::StreamFormat( + "%s uses kernel for xor masks [%s]", input.name, + absl::StrJoin(xor_masks, ", ", [](std::string* out, int64 xor_mask) { + absl::StrAppendFormat(out, "0x%x", xor_mask); + })); + thunks.push_back(BuildKernelThunkForMlir(input.name, Thunk::ThunkInfo(), + slices, &ir_arrays)); LaunchDimensions launch_dimensions = xor_masks.size() > 1 ? tiled_launch_dimensions : standard_launch_dimensions; UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), ir_emitter_context_->llvm_module()); std::vector values_arrays; - values_arrays.reserve(sort->operand_count()); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); - values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + values_arrays.reserve(operand_count); + for (int64 i = 0; i < operand_count; ++i) { + values_arrays.push_back(ir_arrays[i]); } + TF_ASSIGN_OR_RETURN( + const HloComputation* comparator, + GetOrCreateSubComputationFromRegion(&sort_op.comparator())); return llvm_ir::EmitSortInPlace( - dimension_to_sort, values_arrays, IrName(sort), xor_masks, &b_, + dimension_to_sort, values_arrays, IrName(input.name), xor_masks, &b_, launch_dimensions, xor_masks.size() > 1 ? num_iterations_in_sort_dim : standard_num_iterations_in_sort_dim, kTileSize, [&](absl::Span operands, llvm::Value* output) { - return EmitCallToNestedComputation(*sort->to_apply(), operands, - output); + return EmitCallToNestedComputation(*comparator, operands, output); }); }; std::vector xor_masks; @@ -1421,15 +1595,19 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { if (!xor_masks.empty()) { TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); } + VLOG(2) << absl::StreamFormat( + "%s requires %d thunks (including any D2D copies)", input.name, + thunks.size()); - AddThunkToThunkSequence(absl::make_unique( - GetThunkInfo(sort), std::move(thunks))); - if (sort->operand_count() > 1) { + AddThunkToThunkSequence( + absl::make_unique(input.thunk_info, std::move(thunks))); + if (operand_count > 1) { // Emit the tuple as part of the last stage of sorting. // We are currently in the block sorted.in_bounds.after. b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); - llvm_ir::EmitTuple(GetIrArray(*sort, *sort), - ConstructIrArrayForOutputs(*sort), &b_); + llvm_ir::EmitTuple( + ir_arrays[operand_count], + absl::MakeSpan(ir_arrays).subspan(0, ir_arrays.size() - 1), &b_); } return Status::OK(); } @@ -1567,24 +1745,6 @@ Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { return Status::OK(); } -// Describes how to access a particular subshape for an HLO. 
For instance if -// `.hlo_index` is {1} and `.gte_index` is {3, 4} then buffer for `.instr` at -// ShapeIndex {1} (i.e. the buffer for the second tuple element of hlo) is found -// at `.buffer_slice`[3][4]. That is, `.slice` is a void***, which we -// dereference twice -- first at index 3, and then at index 4 -- to get the -// address of our buffer. -struct HloBufferSlice { - const HloInstruction* instr; - ShapeIndex hlo_index; - - // The root buffer to look at. - BufferAllocation::Slice buffer_slice; - - // Describes how to dereference starting at that buffer to get to the buffer - // in question. - ShapeIndex gte_index; -}; - // Figures out how to access the buffers for all subshapes of hlo's operands and // for hlo itself (i.e. all the buffers produced by HLO). // @@ -1693,22 +1853,22 @@ static std::vector GetHloBufferSlices( return result; } -std::unique_ptr IrEmitterUnnested::BuildKernelThunk( - const HloInstruction* inst, bool implements_whole_instruction) { - const BufferAssignment& buffer_assn = - ir_emitter_context_->buffer_assignment(); - - std::vector hlo_slices = - GetHloBufferSlices(inst, buffer_assn); +std::unique_ptr +IrEmitterUnnested::BuildKernelThunkFromBufferSlices( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::function + bind_slice_to_ir_value) { + const auto& buffer_assn = ir_emitter_context_->buffer_assignment(); // Figure out which buffer allocations need to be passed as arguments to our - // kernel. This is simply all of the allocations referenced in hlo_slices, + // kernel. This is simply all of the allocations referenced in slices, // plus the XLA temp buffer (if we have it). We always include the temp // buffer because even if the kernel itself doesn't use it, a nested // subcomputation within the kernel (e.g. a kMap's computation) might. std::unordered_set buffers_needed; - for (const auto& hlo_buffer_slice : hlo_slices) { - buffers_needed.insert(hlo_buffer_slice.buffer_slice.allocation()); + for (auto* slice : slices) { + buffers_needed.insert(slice->buffer_slice.allocation()); } absl::optional temp_buffer; for (const BufferAllocation& alloc : buffer_assn.Allocations()) { @@ -1737,7 +1897,7 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( return a->index() < b->index(); }); - llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers); + llvm::Function* kernel = BuildKernelPrototype(name, non_constant_buffers); // Build a map from a BufferAllocation to the corresponding argument in our // kernel. @@ -1771,24 +1931,19 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( // For each buffer our kernel might want to touch, bind it to a value derived // from our kernel args. 
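BuildKernelThunkFromBufferSlices above factors the thunk construction so that the HLO path and the new MLIR path differ only in how a resolved buffer address is bound: each caller supplies a bind_slice_to_ir_value callback. A minimal standalone sketch of that callback seam, with hypothetical Slice/Value types standing in for BufferSlice and llvm::Value*.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Slice {
  std::string label;
};
using Value = std::string;  // pretend an "IR value" is just its name

// Shared builder: resolves an address for every slice, then lets the caller
// decide what to do with the (slice, value) pair.
void BuildKernel(const std::vector<Slice>& slices,
                 const std::function<void(const Slice&, Value)>& bind) {
  for (const Slice& s : slices) {
    Value v = "gep(" + s.label + ")";  // stand-in for the real GEP/load logic
    bind(s, v);
  }
}

int main() {
  std::vector<Slice> slices = {{"operand0"}, {"output0"}};

  // "HLO" caller: binds values into an instruction-to-value map (printed here).
  BuildKernel(slices, [](const Slice& s, Value v) {
    std::cout << "bind HLO buffer " << s.label << " -> " << v << "\n";
  });

  // "MLIR" caller: wraps the same values into typed arrays instead.
  std::vector<Value> ir_arrays;
  BuildKernel(slices, [&](const Slice& s, Value v) {
    ir_arrays.push_back(s.label + ": typed(" + v + ")");
  });
  std::cout << "collected " << ir_arrays.size() << " ir arrays\n";
}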
- for (const auto& hlo_buffer_slice : hlo_slices) { - const HloInstruction* instr = hlo_buffer_slice.instr; - const ShapeIndex& index = hlo_buffer_slice.hlo_index; - const BufferAllocation::Slice& slice = hlo_buffer_slice.buffer_slice; - const ShapeIndex& gte_index = hlo_buffer_slice.gte_index; - - VLOG(3) << "Buffer for " << instr->ToString() << " at " << index.ToString() - << " is found in slice " << slice.ToString() << " at GTE index " - << gte_index.ToString(); + for (auto* slice : slices) { + const BufferAllocation::Slice& buffer_slice = slice->buffer_slice; + const ShapeIndex& gte_index = slice->gte_index; llvm::Value* loc; - if (slice.allocation()->is_constant()) { + if (buffer_slice.allocation()->is_constant()) { loc = ir_emitter_context_->llvm_module()->getGlobalVariable( - llvm_ir::ConstantBufferAllocationToGlobalName(*slice.allocation())); + llvm_ir::ConstantBufferAllocationToGlobalName( + *buffer_slice.allocation())); CHECK_NE(loc, nullptr); } else { - loc = InBoundsGEP(kernel_args.at(slice.allocation()), - {b_.getInt64(slice.offset())}); + loc = InBoundsGEP(kernel_args.at(buffer_slice.allocation()), + {b_.getInt64(buffer_slice.offset())}); } // If gte_index is nonempty, we have to dereference `loc` to get to the @@ -1800,7 +1955,7 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( loc = Load(InBoundsGEP(loc, {b_.getInt64(idx)})); } - bindings_.BindHloToIrValue(*instr, loc, index); + bind_slice_to_ir_value(slice, loc); } // Bind the temp buffer so that nested subcomputations can find it if they @@ -1812,9 +1967,66 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( llvm::ConstantPointerNull::get(b_.getInt8PtrTy())); } - return absl::make_unique( + return absl::make_unique(thunk_info, non_constant_buffers, + std::string(kernel->getName())); +} + +std::unique_ptr IrEmitterUnnested::BuildKernelThunk( + const HloInstruction* inst, bool implements_whole_instruction) { + std::vector hlo_slices = + GetHloBufferSlices(inst, ir_emitter_context_->buffer_assignment()); + + std::vector slice_ptrs; + slice_ptrs.reserve(hlo_slices.size()); + for (auto& slice : hlo_slices) { + slice_ptrs.push_back(&slice); + } + + return BuildKernelThunkFromBufferSlices( + inst->name(), implements_whole_instruction ? 
GetThunkInfo(inst) : Thunk::ThunkInfo(), - non_constant_buffers, std::string(kernel->getName())); + slice_ptrs, [this](const BufferSlice* slice, llvm::Value* value) { + const HloBufferSlice* hlo_buffer_slice = + static_cast(slice); + const HloInstruction* instr = hlo_buffer_slice->instr; + const ShapeIndex& index = hlo_buffer_slice->hlo_index; + VLOG(3) << "Buffer for " << instr->ToString() << " at " + << index.ToString() << " is found in slice " + << hlo_buffer_slice->buffer_slice.ToString() << " at GTE index " + << hlo_buffer_slice->gte_index.ToString(); + + bindings_.BindHloToIrValue(*instr, value, index); + }); +} + +std::unique_ptr IrEmitterUnnested::BuildKernelThunkForMlir( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::vector* ir_arrays) { + absl::flat_hash_set buffers_written; + std::vector slice_ptrs; + slice_ptrs.reserve(slices.size()); + for (auto& slice : slices) { + slice_ptrs.push_back(&slice); + if (slice.written) { + buffers_written.insert(slice.buffer_slice); + } + } + + ir_arrays->clear(); + return BuildKernelThunkFromBufferSlices( + name, thunk_info, slice_ptrs, + [&](const BufferSlice* slice, llvm::Value* value) { + const auto& mlir_slice = static_cast(*slice); + + llvm_ir::IrArray ir_array( + CastToTypedValue(mlir_slice.shape, value, &b_), mlir_slice.shape); + if (!buffers_written.contains(slice->buffer_slice)) { + ir_array.MarkInvariantOverWholeProgram(&value->getContext()); + } + + ir_arrays->push_back(ir_array); + }); } StatusOr> IrEmitterUnnested::BuildInitializerThunk( @@ -2021,7 +2233,7 @@ Status CheckConditionalBuffersShareAllocation( } // namespace -std::unique_ptr IrEmitterUnnested::BuildWhileThunk( +StatusOr> IrEmitterUnnested::BuildWhileThunk( const HloInstruction* hlo) { // Check that all while-related buffers share an allocation. TF_CHECK_OK(CheckWhileBuffersShareAllocation( @@ -2029,24 +2241,26 @@ std::unique_ptr IrEmitterUnnested::BuildWhileThunk( // Generate thunk sequence for while 'condition'. HloComputation* condition = hlo->while_condition(); - IrEmitterUnnested ir_emitter_condition(hlo_module_config_, condition, - ir_emitter_context_); - TF_CHECK_OK(condition->Accept(&ir_emitter_condition)); + TF_ASSIGN_OR_RETURN(auto ir_emitter_condition, + IrEmitterUnnested::Create(hlo_module_config_, condition, + ir_emitter_context_)); + TF_RETURN_IF_ERROR(condition->Accept(ir_emitter_condition.get())); // Generate thunk sequence for while 'body'. HloComputation* body = hlo->while_body(); - IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, - ir_emitter_context_); - TF_CHECK_OK(body->Accept(&ir_emitter_body)); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_body, + IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); - return absl::make_unique( + return std::unique_ptr(new WhileThunk( GetThunkInfo(hlo), GetAllocationSlice(*condition->root_instruction()), // cond result - ir_emitter_condition.ConsumeThunkSequence(), - ir_emitter_body.ConsumeThunkSequence()); + ir_emitter_condition->ConsumeThunkSequence(), + ir_emitter_body->ConsumeThunkSequence())); } -std::unique_ptr IrEmitterUnnested::BuildForThunk( +StatusOr> IrEmitterUnnested::BuildForThunk( const HloInstruction* hlo, const int64 loop_limit) { // Check that all while-related buffers share an allocation. 
TF_CHECK_OK(CheckWhileBuffersShareAllocation( @@ -2054,15 +2268,16 @@ std::unique_ptr IrEmitterUnnested::BuildForThunk( // Generate thunk sequence for while 'body' (will be used a For loop body). HloComputation* body = hlo->while_body(); - IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, - ir_emitter_context_); - TF_CHECK_OK(body->Accept(&ir_emitter_body)); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_body, + IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); - return absl::make_unique(GetThunkInfo(hlo), loop_limit, - ir_emitter_body.ConsumeThunkSequence()); + return std::unique_ptr(new ForThunk( + GetThunkInfo(hlo), loop_limit, ir_emitter_body->ConsumeThunkSequence())); } -std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( +StatusOr> IrEmitterUnnested::BuildConditionalThunk( const HloInstruction* hlo) { // Check that the buffers used in conditional are shared with the operands and // result appropriately. @@ -2074,15 +2289,17 @@ std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( for (int j = 0; j < hlo->branch_count(); ++j) { branch_operands.emplace_back(GetAllocationSlice(*hlo->operand(j + 1))); HloComputation* branch_computation = hlo->branch_computation(j); - IrEmitterUnnested ir_emitter(hlo_module_config_, branch_computation, - ir_emitter_context_); - TF_CHECK_OK(branch_computation->Accept(&ir_emitter)); - branch_thunks.push_back(std::move(*ir_emitter.ConsumeThunkSequence())); + TF_ASSIGN_OR_RETURN( + auto ir_emitter, + IrEmitterUnnested::Create(hlo_module_config_, branch_computation, + ir_emitter_context_)); + TF_CHECK_OK(branch_computation->Accept(ir_emitter.get())); + branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence())); } - return absl::make_unique( + return std::unique_ptr(new ConditionalThunk( GetThunkInfo(hlo), GetAllocationSlice(*hlo->operand(0)), branch_operands, - std::move(branch_thunks)); + std::move(branch_thunks))); } Status IrEmitterUnnested::EmitTargetElementLoopInThunk( @@ -2775,6 +2992,28 @@ void IrEmitterUnnested::EmitPrintfWithThreadId( }); } +namespace { + +// Obtains the corresponding index of the out_instr in the outputs of the +// `unnested_hlo`. +ShapeIndex CreateShapeIndexForOutputInstruction( + const HloInstruction& unnested_hlo, const HloInstruction& out_instr) { + if (!unnested_hlo.IsMultiOutputFusion()) { + return ShapeIndex({}); + } + const auto& all_outputs = unnested_hlo.fused_expression_root()->operands(); + for (size_t i = 0; i < all_outputs.size(); ++i) { + if (all_outputs[i] == &out_instr) { + return ShapeIndex({static_cast(i)}); + } + } + LOG(FATAL) << " Fusion root does not contain output instruction; " + << " fusion: " << unnested_hlo.ToString() + << ", output instruction: " << out_instr.ToString(); +} + +} // namespace + void IrEmitterUnnested::EmitTileElementForReduction( HloInstruction* unnested_hlo, const Shape& reduction_operand_shape, absl::Span output_instructions, @@ -2782,7 +3021,6 @@ void IrEmitterUnnested::EmitTileElementForReduction( const ReductionCodegenInfo& reduction_info, absl::Span reducers, int64 x_iter_num) { VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString(); - bool returns_tuple = output_instructions.size() > 1; int partial_result_index = reduction_info.IsRowReduction() ? 
0 : x_iter_num; InlinedVector input_gens; @@ -2799,7 +3037,8 @@ void IrEmitterUnnested::EmitTileElementForReduction( for (int i = 0, e = output_instructions.size(); i != e; ++i) { const HloInstruction* inst = output_instructions[i]; - ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); + ShapeIndex idx = + CreateShapeIndexForOutputInstruction(*unnested_hlo, *inst); if (IsReductionFromOrToContiguousDimensions(*inst)) { input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0))); } else { @@ -3532,71 +3771,41 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.is_row_reduction); } -Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( +void IrEmitterUnnested::EmitIRForReduction( HloInstruction* unnested_hlo, - absl::Span output_instructions) { - bool returns_tuple = output_instructions.size() > 1; - VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString(); - + absl::Span output_instructions, + ReductionCodegenInfo* reduction_info, const Shape& input_shape) { std::vector reduce_instructions; InlinedVector reduction_output_shape_indices; InlinedVector reducers; - - // Build an initializer thunk to initialize each reduction output. - std::vector> thunks; - for (int i = 0; i < output_instructions.size(); ++i) { + for (size_t i = 0; i < output_instructions.size(); ++i) { if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { continue; } HloInstruction* output_instruction = output_instructions[i]; reduce_instructions.push_back(output_instruction); - ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); - reduction_output_shape_indices.push_back(idx); + reduction_output_shape_indices.push_back( + CreateShapeIndexForOutputInstruction(*unnested_hlo, + *output_instruction)); reducers.push_back(output_instruction->to_apply()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, - BuildInitializerThunk(unnested_hlo, idx)); - thunks.push_back(std::move(initializer_thunk)); } + CHECK(reduce_instructions.size() != 0) + << " expect at least one reduce instructions."; - const HloInstruction* first_reduce = reduce_instructions.at(0); - if (output_instructions.size() > 1) { - if (!AreFusedReductionOutputsConsistent(output_instructions, - first_reduce)) { - return InternalError("Inconsistent reduction fusion outputs"); - } - } - - // Build a kernel thunk to compute all the outputs. - std::unique_ptr kernel_thunk = - BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false); - - const Shape& input_shape = first_reduce->operand(0)->shape(); - // The layout of a reduction input is either set by LayoutAssignment for - // unnested kReduce or by InstructionFusion for fused kReduce. 
- CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " - "doesn't set the input layout of " - << first_reduce->ToString(); - - ReductionCodegenInfo reduction_info = - ComputeReductionCodegenInfo(unnested_hlo, first_reduce); const KernelMappingScheme& mapping_scheme = - reduction_info.GetKernelMappingScheme(); + reduction_info->GetKernelMappingScheme(); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); - VLOG(3) << "Launch dimensions of " << unnested_hlo->name() - << ": number of blocks: " << mapping_scheme.GetNumberOfBlocks() - << " - threads per block: " << mapping_scheme.GetThreadsPerBlock(); llvm::Type* index_ty = GetIndexTypeForKernel( unnested_hlo, launch_dimensions.launch_bound(), &b_); - EmitPrologueForReduction(unnested_hlo, &reduction_info, reduce_instructions, + EmitPrologueForReduction(unnested_hlo, reduction_info, reduce_instructions, index_ty); EmitElementFunction emit_reduction_tile = [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, int64 x_iter_num) { EmitTileElementForReduction(unnested_hlo, input_shape, - output_instructions, index, reduction_info, + output_instructions, index, *reduction_info, reducers, x_iter_num); }; @@ -3605,70 +3814,185 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( [&](const ThreadIdInfo& thread_id_info, const IrArray::Index& index, const string& loop_name, llvm::Value* tile_height, llvm::Value* tile_width, KernelSupportLibrary* ksl) { - EmitTile(reduction_info.GetKernelMappingScheme(), index, loop_name, ksl, - thread_id_info, tile_height, tile_width, emit_reduction_tile); + EmitTile(reduction_info->GetKernelMappingScheme(), index, loop_name, + ksl, thread_id_info, tile_height, tile_width, + emit_reduction_tile); }); - EmitEpilogueForReduction(index_ty, unnested_hlo, reduction_info, + EmitEpilogueForReduction(index_ty, unnested_hlo, *reduction_info, reduce_instructions, reduction_output_shape_indices, reducers, tiling_kernel_info); +} +namespace { + +// Returns whether the `instr` is either a constant, a scalar, or a +// broadcasted constant/scalar. +bool IsBroadcastedConstantOrScalar(const HloInstruction& instr) { + return instr.IsConstant() || ShapeUtil::IsScalar(instr.shape()) || + (HloOpcode::kBroadcast == instr.opcode() && + (instr.operand(0)->IsConstant() || + ShapeUtil::IsScalar(instr.operand(0)->shape()))); +} + +// Divides output_instructions into groups. Different groups will be executed +// in parallel. Generally speaking, we'd like to run the reduce instructions +// in parallel without incurring too much recomputation overhead. The current +// heuristic is to place reduce instructions who share nothing or only +// (broadcasted) scalars/constants into different groups; otherwise, they are +// placed in the same group. Non-reduce instructions always go with the reduce +// instructions into the same group so long as they share any predecessors. 
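A standalone sketch of the grouping heuristic described in the comment above: reduce outputs that only share broadcasted constants/scalars stay in separate groups, while outputs reachable from a common non-trivial predecessor are merged with union-find. It uses a hand-rolled disjoint-set in place of the UnionFind type from union_find.h, and plain strings in place of HLO instructions.

#include <iostream>
#include <map>
#include <numeric>
#include <string>
#include <vector>

// Minimal disjoint-set (union-find) over indices.
struct DisjointSets {
  std::vector<int> parent;
  explicit DisjointSets(int n) : parent(n) {
    std::iota(parent.begin(), parent.end(), 0);
  }
  int Find(int x) { return parent[x] == x ? x : parent[x] = Find(parent[x]); }
  void Merge(int a, int b) { parent[Find(a)] = Find(b); }
};

int main() {
  // Outputs 0 and 1 share the non-trivial predecessor "param0";
  // output 2 only shares a broadcasted constant, so it stays separate.
  std::vector<std::string> outputs = {"reduce.0", "reduce.1", "reduce.2"};
  std::vector<std::vector<int>> predecessor_reaches = {
      /*param0 reaches*/ {0, 1},
      /*broadcasted constant (ignored by the heuristic)*/ {},
  };

  DisjointSets sets(outputs.size());
  for (const std::vector<int>& reached : predecessor_reaches) {
    for (size_t j = 1; j < reached.size(); ++j) {
      sets.Merge(reached[0], reached[j]);
    }
  }

  // Bucket outputs by representative, mirroring the groups map in the patch.
  std::map<int, std::vector<std::string>> groups;
  for (size_t i = 0; i < outputs.size(); ++i) {
    groups[sets.Find(i)].push_back(outputs[i]);
  }
  for (const auto& [rep, members] : groups) {
    std::cout << "group " << rep << ":";
    for (const std::string& m : members) std::cout << " " << m;
    std::cout << "\n";
  }
}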
+std::vector> DivideOutputInstructionsIntoGroups( + HloInstruction* unnested_hlo, + absl::Span output_instructions) { + CHECK(!output_instructions.empty()); + if (output_instructions.size() == 1) { + return {{output_instructions[0]}}; + } + + std::vector> disjoint_sets( + output_instructions.size()); + for (size_t i = 0; i < output_instructions.size(); ++i) { + disjoint_sets[i].Get() = output_instructions[i]; + } + + std::unique_ptr reachability_map = + HloReachabilityMap::Build(unnested_hlo->fused_instructions_computation()); + for (auto* instr : unnested_hlo->fused_instructions()) { + std::vector reached_output_ids; + for (size_t oid = 0; oid < output_instructions.size(); ++oid) { + if (HloOpcode::kReduce == output_instructions[oid]->opcode() && + (IsBroadcastedConstantOrScalar(*instr))) { + // Do not group output reduce instructions through broadcasted + // constants or scalars, as the recomputation should be acceptable. + VLOG(3) << "Skip broadcasted constant or scalar " << instr->ToString(); + continue; + } + // Now group output instructions if they have common predecessors. + if (reachability_map->IsReachable(instr, output_instructions[oid])) { + VLOG(3) << "Reaching " << output_instructions[oid]->ToString() + << " from " << instr->ToString(); + reached_output_ids.push_back(oid); + } + } + for (size_t j = 1; j < reached_output_ids.size(); ++j) { + disjoint_sets[reached_output_ids[0]].Merge( + &disjoint_sets[reached_output_ids[j]]); + } + } + // Place output instructions in the same set into the same group. + absl::flat_hash_map> groups; + for (size_t oid = 0; oid < output_instructions.size(); ++oid) { + groups[disjoint_sets[oid].Get()].push_back(output_instructions.at(oid)); + } + + std::vector> ret; + absl::c_for_each( + groups, [&](auto& iter) { ret.emplace_back(std::move(iter.second)); }); + return ret; +} + +} // namespace + +Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( + HloInstruction* unnested_hlo, + absl::Span output_instructions) { + bool returns_tuple = output_instructions.size() > 1; + VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString(); + + // Build an initializer thunk to initialize each reduction output. + std::vector> thunks; + for (int i = 0; i < output_instructions.size(); ++i) { + if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { + continue; + } + + ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); + TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, + BuildInitializerThunk(unnested_hlo, idx)); + thunks.push_back(std::move(initializer_thunk)); + } + + // Build a kernel thunk to compute all the outputs. + const HloInstruction* first_reduce = nullptr; + for (int i = 0; i < output_instructions.size(); ++i) { + if (IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { + first_reduce = output_instructions[i]; + break; + } + } + CHECK(first_reduce); + if (output_instructions.size() > 1) { + if (!AreFusedReductionOutputsConsistent(output_instructions, + first_reduce)) { + return InternalError("Inconsistent reduction fusion outputs"); + } + } + const Shape& input_shape = first_reduce->operand(0)->shape(); + // The layout of a reduction input is either set by LayoutAssignment for + // unnested kReduce or by InstructionFusion for fused kReduce. + CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " + "doesn't set the input layout of " + << first_reduce->ToString(); + + // Group output instructions. Each group will be executed in parallel. 
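Further down in this function, each reduction group is wrapped in an `if (block_id_y == i)` guard so distinct groups run on distinct blockIdx.y values. Below is a host-side C++ simulation of that dispatch; the real code emits the guard as IR through KernelSupportLibrary::If and reads blockIdx.y via a target intrinsic.

#include <functional>
#include <iostream>
#include <vector>

int main() {
  // One lambda per reduction group, standing in for the emitted group bodies.
  std::vector<std::function<void()>> groups = {
      [] { std::cout << "emit group 0\n"; },
      [] { std::cout << "emit group 1\n"; },
  };

  int blocks_x = 2;                                 // blocks from the mapping scheme
  int blocks_y = static_cast<int>(groups.size());   // one y-slice per group

  // Simulate the launch grid: every (x, y) block runs the same kernel body,
  // and the guard selects which group's code is active for this block.
  for (int block_id_y = 0; block_id_y < blocks_y; ++block_id_y) {
    for (int block_id_x = 0; block_id_x < blocks_x; ++block_id_x) {
      for (int i = 0; i < static_cast<int>(groups.size()); ++i) {
        if (block_id_y == i) {  // the "reduce-group-i" guard
          groups[i]();
        }
      }
    }
  }
}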
+ std::vector> instr_groups = + DivideOutputInstructionsIntoGroups(unnested_hlo, output_instructions); + VLOG(2) << StrCat("Generate in ", instr_groups.size(), " groups for ", + unnested_hlo->ToString()); + std::unique_ptr kernel_thunk = + BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false); + KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); + for (size_t i = 0; i < instr_groups.size(); ++i) { + // Create a new ReductionCodegenInfo instance as it contains states for + // code generation per reduction group. For now, let's always use the very + // first reduce as representative to construct ReductionCodegenInfo, since + // all the reductions are required to have the same shape and layout as + // verified by `AreFusedReductionOutputsConsistent()`. We can loosen the + // constraint later when the needs arise. + ReductionCodegenInfo reduction_info = + ComputeReductionCodegenInfo(unnested_hlo, first_reduce); + auto emit_reduction_func = [&] { + EmitIRForReduction(unnested_hlo, instr_groups[i], &reduction_info, + input_shape); + }; + // Use raw block_id_y to select the i-th parallel reduction to run. Using + // block_id_y instead of block_id_x simplifies the index calculation + // for reduction code generation as the block_id_y is orthogonal to + // the indices used within the reductions. + llvm::CallInst* raw_block_id_y = gpu::EmitCallToTargetIntrinsic( + gpu::TargetIntrinsicID::kBlockIdy, {}, {}, &b_); + llvm_ir::AddRangeMetadata(0, instr_groups.size(), + llvm::cast(raw_block_id_y)); + llvm::Value* guarding_cond = + b_.CreateICmpEQ(raw_block_id_y, b_.getInt32(i)); + ksl.If(StrCat("reduce-group-", i), guarding_cond, emit_reduction_func); + } + ReductionCodegenInfo reduction_info = + ComputeReductionCodegenInfo(unnested_hlo, first_reduce); + const KernelMappingScheme& mapping_scheme = + reduction_info.GetKernelMappingScheme(); + // block_y_count is set to instr_groups.size(), so that each reduction group + // can be run in parallel by a different BlockIdy. + LaunchDimensions launch_dimensions( + {/*x=*/mapping_scheme.GetNumberOfBlocks(), + /*y=*/static_cast(instr_groups.size()), + /*z=*/1}, + {/*x=*/mapping_scheme.GetThreadsPerBlock(), /*y=*/1, /*z=*/1}); + VLOG(3) << "Launch dimensions of " << unnested_hlo->name() + << ": number of blocks: " << mapping_scheme.GetNumberOfBlocks() + << " - threads per block: " << mapping_scheme.GetThreadsPerBlock(); UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), ir_emitter_context_->llvm_module()); thunks.push_back(std::move(kernel_thunk)); - auto sequential_thunk = absl::make_unique( - GetThunkInfo(unnested_hlo), std::move(thunks)); + std::unique_ptr sequential_thunk = + absl::make_unique(GetThunkInfo(unnested_hlo), + std::move(thunks)); AddThunkToThunkSequence(std::move(sequential_thunk)); return Status::OK(); } -Status IrEmitterUnnested::EmitConstantGlobals() { - for (const BufferAllocation& allocation : - ir_emitter_context_->buffer_assignment().Allocations()) { - if (!allocation.is_constant()) { - continue; - } - - const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation); - const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal); - llvm::ArrayType* global_type = - llvm::ArrayType::get(b_.getInt8Ty(), allocation.size()); - llvm::Constant* initializer = - should_emit_initializer - ? 
llvm_ir::ConvertLiteralToIrConstant(literal, module_) - : llvm::ConstantAggregateZero::get(global_type); - if (should_emit_initializer) { - VLOG(3) << "Emitted initializer for constant with shape " - << ShapeUtil::HumanString(literal.shape()); - } - - // These globals will be looked up by name by GpuExecutable so we need to - // give them an external linkage. Not all of their uses are visible in - // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that - // merely preserves their names (like available_externally), we also need - // to ensure that they stick around even if they're "unused". - // - // We may have to be more more clever here in the future if we notice that - // we're keeping around too many globals because of their linkage. - unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace( - *ir_emitter_context_->llvm_module()); - llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( - global_type, /*isConstant=*/should_emit_initializer, - llvm::GlobalValue::ExternalLinkage, - /*Initializer=*/initializer, - llvm_ir::ConstantBufferAllocationToGlobalName(allocation), - /*TLMode=*/llvm::GlobalValue::NotThreadLocal, - /*AddressSpace=*/global_address_space, - /*isExternallyInitialized=*/false); - global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes)); - ir_emitter_context_->llvm_module()->getGlobalList().push_back( - global_for_const); - } - - return Status::OK(); -} - // Emits code for slices based on the below structure. An if statement with // a guarding condition is generated for each ROOT slice. // diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 019fcdf21db..c36f0b7840d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_ #include "absl/container/inlined_vector.h" +#include "tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" @@ -28,6 +29,40 @@ limitations under the License. namespace xla { namespace gpu { +struct BufferSlice { + // The root buffer to look at. + BufferAllocation::Slice buffer_slice; + + // Describes how to dereference starting at that buffer to get to the buffer + // in question. + ShapeIndex gte_index; +}; + +// Describes how to access a particular subshape for an HLO. For instance if +// `.hlo_index` is {1} and `.gte_index` is {3, 4} then buffer for `.instr` at +// ShapeIndex {1} (i.e. the buffer for the second tuple element of hlo) is +// found at `.buffer_slice`[3][4]. That is, `.slice` is a void***, which we +// dereference twice -- first at index 3, and then at index 4 -- to get the +// address of our buffer. +struct HloBufferSlice : public BufferSlice { + const HloInstruction* instr; + ShapeIndex hlo_index; +}; + +struct MlirBufferSlice : public BufferSlice { + // The buffer is modified by the kernel. + bool written; + + Shape shape; +}; + +struct MlirEmitterInput { + mlir::Operation* op; + absl::string_view name; + Thunk::ThunkInfo thunk_info; + MlirBufferSlice extra_slice; +}; + // Emits LLVM IR for an "unnested computation". 
// // An unnested computation is an HloComputation which you run by executing one @@ -89,12 +124,14 @@ class IrEmitterUnnested : public IrEmitter, const string& loop_name, llvm::Value* tile_height, llvm::Value* tile_width, KernelSupportLibrary* ksl)>; - IrEmitterUnnested(const HloModuleConfig& hlo_module_config, - const HloComputation* hlo_computation, - IrEmitterContext* ir_emitter_context); IrEmitterUnnested(const IrEmitterUnnested&) = delete; IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete; + static StatusOr> Create( + const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context); + // Transfers the ownship of thunk_sequence_ out. std::unique_ptr ConsumeThunkSequence() { return std::make_unique(std::move(thunk_sequence_)); @@ -124,6 +161,7 @@ class IrEmitterUnnested : public IrEmitter, Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; Status HandleSort(HloInstruction* sort) override; + Status EmitSortFromMlir(MlirEmitterInput input); Status HandleTriangularSolve(HloInstruction* hlo) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleAllReduce(HloInstruction* crs) override; @@ -142,12 +180,13 @@ class IrEmitterUnnested : public IrEmitter, const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter, KernelThunk* thunk, int unroll_factor); - // Emits LLVM global variables corresponding to constant instructions. - Status EmitConstantGlobals(); - Status Postprocess(HloInstruction* hlo) override; private: + IrEmitterUnnested(const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context); + // Add a owning Thunk object to the thunk sequence. void AddThunkToThunkSequence(std::unique_ptr thunk) override { thunk_sequence_.emplace_back(std::move(thunk)); @@ -264,8 +303,7 @@ class IrEmitterUnnested : public IrEmitter, // Builds the prototype of the IR kernel for `inst` and adds it to the module. // This kernel takes as arguments pointers to the given buffer allocations. llvm::Function* BuildKernelPrototype( - const HloInstruction& inst, - absl::Span args); + absl::string_view name, absl::Span args); // Helper for writing extra outputs from inside a reduce kernel. Status EmitExtraOutputsForReduce( @@ -331,6 +369,16 @@ class IrEmitterUnnested : public IrEmitter, // } // ``` // + // Moreover, a heuristic is implemented to divide the reduce instructions + // into groups for parallelization (see `DivideOutputInstructionsIntoGroups` + // for details about the heuristic.) Reduce instructions in the same group + // will run sequentially while different groups will run in parallel. + // + // we use raw block_id_y to select the reduce groups for execution without + // complicating the index calculation in the code generation of the reduce + // instructions. In other words, a block_id_y is assigned to a group and so + // different groups can be run in parallel. + // // output_instructions: Output instructions in the computation: instruction // itself if it's not a fusion, fusion root if fusion is not multi-output, and // elements of the fusion multi-output tuple otherwise. @@ -363,11 +411,10 @@ class IrEmitterUnnested : public IrEmitter, // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is // expected to have the operand values in it already. 
If unique_indices - // is false, we will use an atomic update. Using false for unique_indices - // is safe only when it is guaranteed that there are no duplicate - // indices. - // When using unique_indices=true, it is the caller's responsibility to - // ensure there is no overlap. + // is false, we will use an atomic update. Using true for unique_indices + // behaves properly only when it is guaranteed that the indices to be + // updated do not overlap. The caller is responsible for ensuring this is + // the case. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen); @@ -478,6 +525,12 @@ class IrEmitterUnnested : public IrEmitter, absl::Span reducers, const TilingKernelInfo& tiling_kernel_info); + // Emits code for reductions in the output_instructions. + void EmitIRForReduction(HloInstruction* unnested_hlo, + absl::Span output_instructions, + ReductionCodegenInfo* reduction_info, + const Shape& input_shape); + // For each reducer, emits the shuffle-down loop to accumulate the partial // result to the global result. void EmitFullWarpShuffleDownLoopForAllReduces( @@ -490,6 +543,12 @@ class IrEmitterUnnested : public IrEmitter, HloComputation* reducer, llvm::Type* element_type, llvm::Value* partial_result_address); + std::unique_ptr BuildKernelThunkFromBufferSlices( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::function + bind_slice_to_ir_value); + // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned // Thunk object. 'implements_whole_instruction' specifies whether this @@ -498,6 +557,11 @@ class IrEmitterUnnested : public IrEmitter, std::unique_ptr BuildKernelThunk( const HloInstruction* inst, bool implements_whole_instruction); + std::unique_ptr BuildKernelThunkForMlir( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::vector* ir_arrays); + // Returns a thunk that, given a reduce or select-and-scatter op, // initializes its memory to the appropriate initial value. StatusOr> BuildInitializerThunk( @@ -505,17 +569,18 @@ class IrEmitterUnnested : public IrEmitter, // Returns a WhileThunk that invokes thunk sequences for 'condition' and // 'body' sub-computations of while instruction 'hlo'. - std::unique_ptr BuildWhileThunk(const HloInstruction* hlo); + StatusOr> BuildWhileThunk(const HloInstruction* hlo); // Returns a ForThunk which executes 'loop_limit' invocations of a thunk // sequence from the 'body' sub-computation of the while instruction 'hlo'. - std::unique_ptr BuildForThunk(const HloInstruction* hlo, - const int64 loop_limit); + StatusOr> BuildForThunk(const HloInstruction* hlo, + const int64 loop_limit); // Returns a ConditionalThunk which executes the thunk sequence for the // 'branch_computation' corresponding to the predicate/branch_index of the // given conditional instruction. - std::unique_ptr BuildConditionalThunk(const HloInstruction* hlo); + StatusOr> BuildConditionalThunk( + const HloInstruction* hlo); // Emits current thread id with the given type. // @@ -545,6 +610,9 @@ class IrEmitterUnnested : public IrEmitter, absl::optional thread_id_filter = absl::nullopt, absl::optional block_id_filter = absl::nullopt); + StatusOr GetOrCreateSubComputationFromRegion( + mlir::Region* region); + // Returns the last generated thunk. 
Thunk* LastThunk() const { return thunk_sequence_.back().get(); } @@ -555,6 +623,14 @@ class IrEmitterUnnested : public IrEmitter, // The HloComputation that this IrEmitter emits code for. const HloComputation* hlo_computation_; + + mlir::OwningModuleRef mlir_scratch_module_; + + // This is for cache-purpose only. It has no significant semantics. + mlir::LhloDialectEmitter lhlo_scratch_emitter_; + + absl::flat_hash_map> + scratch_nested_computations_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index 19fef37db7e..6c138258aa0 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -115,9 +115,8 @@ Status KernelThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); - return ExecuteKernelOnStream(*kernel, buffer_args, - launch_dimensions.threads_per_block(), - launch_dimensions.block_count(), params.stream); + return ExecuteKernelOnStream(*kernel, buffer_args, launch_dimensions, + params.stream); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc b/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc index 3668a521ec7..c23e8112cb0 100644 --- a/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc +++ b/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc @@ -26,8 +26,11 @@ namespace gpu { std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims) { - out << absl::StrFormat("[block: %d, thread: %d]", launch_dims.block_count(), - launch_dims.threads_per_block()); + LaunchDimensions::Dim3D block_counts = launch_dims.block_counts(); + LaunchDimensions::Dim3D thread_counts = launch_dims.thread_counts_per_block(); + out << absl::StrFormat("[block: {%d, %d, %d}, thread: {%d, %d, %d}]", + block_counts.x, block_counts.y, block_counts.z, + thread_counts.x, thread_counts.y, thread_counts.z); return out; } diff --git a/tensorflow/compiler/xla/service/gpu/launch_dimensions.h b/tensorflow/compiler/xla/service/gpu/launch_dimensions.h index 1a5a9d618e4..dbe5a037e43 100644 --- a/tensorflow/compiler/xla/service/gpu/launch_dimensions.h +++ b/tensorflow/compiler/xla/service/gpu/launch_dimensions.h @@ -29,24 +29,37 @@ namespace gpu { // number of threads per block. class LaunchDimensions { public: + struct Dim3D { + int64 x, y, z; + }; + // The default constructor creates a launch dimension that indicate // single-threaded execution. 
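The LaunchDimensions change that starts above (and continues below) replaces the scalar block_count/threads_per_block pair with 3D Dim3D counts, and launch_bound becomes the product of all six components. A standalone sketch of the new shape of the API, with simplified names.

#include <cstdint>
#include <iostream>

struct Dim3D {
  int64_t x, y, z;
};

class LaunchDims {
 public:
  LaunchDims(Dim3D blocks, Dim3D threads)
      : block_counts_(blocks), thread_counts_per_block_(threads) {}

  Dim3D block_counts() const { return block_counts_; }
  Dim3D thread_counts_per_block() const { return thread_counts_per_block_; }

  // Total number of threads in the launch, across all six dimensions.
  int64_t launch_bound() const {
    return block_counts_.x * block_counts_.y * block_counts_.z *
           thread_counts_per_block_.x * thread_counts_per_block_.y *
           thread_counts_per_block_.z;
  }

 private:
  Dim3D block_counts_;
  Dim3D thread_counts_per_block_;
};

int main() {
  // E.g. the grouped-reduction launch: y counts the parallel reduce groups.
  LaunchDims dims(/*blocks=*/{/*x=*/64, /*y=*/3, /*z=*/1},
                  /*threads=*/{/*x=*/256, /*y=*/1, /*z=*/1});
  std::cout << "launch_bound = " << dims.launch_bound() << "\n";  // 49152
}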
- LaunchDimensions() : block_count_(1), threads_per_block_(1) {} + LaunchDimensions() + : block_counts_({1, 1, 1}), thread_counts_per_block_({1, 1, 1}) {} - LaunchDimensions(int64 block_count, int64 threads_per_block) - : block_count_(block_count), threads_per_block_(threads_per_block) {} + LaunchDimensions(int64 block_x_count, int64 thread_x_count_per_block) + : block_counts_({block_x_count, 1, 1}), + thread_counts_per_block_({thread_x_count_per_block, 1, 1}) {} - bool IsSinglethreaded() const { - return block_count_ == 1 && threads_per_block_ == 1; + LaunchDimensions(const Dim3D& block_counts, + const Dim3D& thread_counts_per_block) + : block_counts_(block_counts), + thread_counts_per_block_(thread_counts_per_block) {} + + Dim3D block_counts() const { return block_counts_; } + + Dim3D thread_counts_per_block() const { return thread_counts_per_block_; } + + int64 launch_bound() const { + return block_counts_.x * thread_counts_per_block_.x * block_counts_.y * + thread_counts_per_block_.y * block_counts_.z * + thread_counts_per_block_.z; } - int64 block_count() const { return block_count_; } - int64 threads_per_block() const { return threads_per_block_; } - int64 launch_bound() const { return block_count() * threads_per_block(); } - private: - int64 block_count_; - int64 threads_per_block_; + Dim3D block_counts_; + Dim3D thread_counts_per_block_; }; std::ostream& operator<<(std::ostream& out, diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 1228a1b4823..04af67a70b9 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -62,8 +62,10 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/random.h" #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/env_var.h" namespace xla { namespace gpu { @@ -86,14 +88,21 @@ static string GetSmName(std::pair compute_capability) { int sm_version = 30; // If the current compute capability isn't known, fallback to the // most recent version before it. - for (int v : {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30}) { + int supported_versions[] = {75, 72, 70, 62, 61, 60, 53, + 52, 50, 37, 35, 32, 30}; + for (int v : supported_versions) { if (v <= compute_capability_version) { sm_version = v; break; } } - if (sm_version != compute_capability_version) { + // If the current CC isn't supported by LLVM and it is newer then + // the max supported LLVM version, do not warn about it. The end + // user can't do anything about this. PTX compiled for SM75 will + // run on SM80 too. + if (sm_version != compute_capability_version && + compute_capability_version < supported_versions[0]) { LOG(WARNING) << "Unknown compute capability (" << compute_capability.first << ", " << compute_capability.second << ") ." 
<< "Defaulting to telling LLVM that we're compiling for sm_" @@ -570,6 +579,60 @@ static std::vector GetROCDLPaths(int amdgpu_version, return result; } +struct HsacoCacheEntry { + uint64 hash; + std::string ir; + int gfx; + std::vector hsaco; +}; + +struct HsacoCache { + protected: + std::vector cache; + std::mutex m_mutex; + int request_count = 0; + int hit_count = 0; + + public: + static bool Find(const std::string& ir, uint64_t& hash, int gfx, + std::vector& hsaco); + static void Add(const std::string& ir, uint64_t hash, int gfx, + const std::vector& hsaco); +}; + +static HsacoCache g_hsacoCache; + +bool HsacoCache::Find(const std::string& ir, uint64_t& hash, int gfx, + std::vector& hsaco) { + std::lock_guard lg(g_hsacoCache.m_mutex); + hash = std::hash{}(ir); + bool hit = false; + for (auto& x : g_hsacoCache.cache) { + if (x.hash != hash) continue; + if (x.gfx != gfx) continue; + if (x.ir != ir) continue; + hsaco = x.hsaco; + hit = true; + break; + } + g_hsacoCache.request_count++; + if (hit) g_hsacoCache.hit_count++; + if (!(g_hsacoCache.request_count % 50)) + VLOG(1) << "HSACO cache: " << g_hsacoCache.request_count << " requests, " + << g_hsacoCache.hit_count << " hits"; + return hit; +} + +void HsacoCache::Add(const std::string& ir, uint64_t hash, int gfx, + const std::vector& hsaco) { + std::lock_guard lg(g_hsacoCache.m_mutex); + g_hsacoCache.cache.resize(g_hsacoCache.cache.size() + 1); + g_hsacoCache.cache.back().ir = ir; + g_hsacoCache.cache.back().hash = hash; + g_hsacoCache.cache.back().gfx = gfx; + g_hsacoCache.cache.back().hsaco = hsaco; +} + // Emits the given module to HSA Code Object. target_machine is an initialized // TargetMachine for the AMDGPU target. StatusOr> EmitModuleToHsaco( @@ -584,18 +647,29 @@ StatusOr> EmitModuleToHsaco( std::string tempdir_name = tempdir_vector.front(); VLOG(1) << "Compile-time artifacts located at: " << tempdir_name; + bool keep_tempfiles = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES", + /*default_val=*/false, + &keep_tempfiles)); // Prepare filenames for all stages of compilation: // IR, binary ISA, and HSACO. 
- std::string ir_filename = absl::StrCat(module->getModuleIdentifier(), ".ll"); + std::string random_number = std::to_string(tensorflow::random::New64()); + std::string ir_filename = + absl::StrCat(module->getModuleIdentifier(), random_number + ".ll"); std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename); + std::string ir_opt_filename = + absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll"); + std::string ir_opt_path = + tensorflow::io::JoinPath(tempdir_name, ir_opt_filename); + std::string isabin_filename = - absl::StrCat(module->getModuleIdentifier(), ".o"); + absl::StrCat(module->getModuleIdentifier(), random_number + ".o"); std::string isabin_path = tensorflow::io::JoinPath(tempdir_name, isabin_filename); std::string hsaco_filename = - absl::StrCat(module->getModuleIdentifier(), ".hsaco"); + absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco"); std::string hsaco_path = tensorflow::io::JoinPath(tempdir_name, hsaco_filename); @@ -613,7 +687,7 @@ StatusOr> EmitModuleToHsaco( std::string module_id = module->getModuleIdentifier(); IrDumpingPassManager codegen_passes( ReplaceFilenameExtension(tensorflow::io::Basename(module_id), - "-amdgpu.dummy"), + random_number + "-amdgpu.dummy"), "", false); codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( llvm::Triple(module->getTargetTriple()))); @@ -627,6 +701,12 @@ StatusOr> EmitModuleToHsaco( codegen_passes.run(*module); isabin_fs->flush(); + if (keep_tempfiles) { + std::unique_ptr ir_fs( + new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::F_None)); + module->print(*ir_fs, nullptr); + ir_fs->flush(); + } // Locate lld. // TODO(whchung@gmail.com): change to tensorflow::ROCmRoot() after // ROCm-Device-Libs PR. @@ -652,9 +732,9 @@ StatusOr> EmitModuleToHsaco( int lld_result = llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args), llvm::None, {}, 0, 0, &error_message); - if (lld_result) { - return xla::InternalError("ld.lld execute fail: %s", error_message); + return xla::InternalError("ld.lld execute fail: %s, error code %d", + error_message, lld_result); } // Read HSACO. @@ -664,6 +744,12 @@ StatusOr> EmitModuleToHsaco( std::vector hsaco(hsaco_file_size); hsaco_file.seekg(0, std::ios::beg); hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); + hsaco_file.close(); + if (!keep_tempfiles) { + remove(ir_path.c_str()); + remove(isabin_path.c_str()); + remove(hsaco_path.c_str()); + } return hsaco; } @@ -728,6 +814,20 @@ StatusOr> CompileToHsaco( std::vector hsaco; std::unique_ptr target_machine; + std::string str; + llvm::raw_string_ostream stream(str); + stream << *module; + // Delete the first two lines, since they usually vary even when the rest of + // the code is the same (but verify that they are what we expect). 
+ if (str.size() >= 13 && str.substr(0, 13) == "; ModuleID = ") { + auto pos = str.find("\n"); + if (pos != std::string::npos) str = str.substr(pos + 1); + } + if (str.size() >= 18 && str.substr(0, 18) == "source_filename = ") { + auto pos = str.find("\n"); + if (pos != std::string::npos) str = str.substr(pos + 1); + } + str += hlo_module_config.compilation_cache_key(); { tensorflow::profiler::TraceMe activity( [&] { return absl::StrCat("Compiling IR", module->getName().str()); }, @@ -739,6 +839,21 @@ StatusOr> CompileToHsaco( return xla::InternalError( "Incompatible AMD GCN ISA version was specified."); } + uint64_t hash; + if (HsacoCache::Find(str, hash, *amdgpu_version, hsaco)) { + VLOG(1) << "HSACO cache hit"; + return hsaco; + } + VLOG(1) << "HSACO cache miss"; + bool dump_lls = false; + if (dump_lls) { + static int hsaco_count = 0; + std::string name = "/tmp/" + std::to_string(hsaco_count) + ".ll"; + hsaco_count++; + std::ofstream ofs(name); + ofs << str; + ofs.close(); + } llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz"); // Construct LLVM TargetMachine for AMDGPU. @@ -754,6 +869,7 @@ StatusOr> CompileToHsaco( // Lower optimized LLVM module to HSA code object. TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get())); + HsacoCache::Add(str, hash, *amdgpu_version, hsaco); } return hsaco; } diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc index f9937ba77de..6b7b31e8288 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc @@ -75,7 +75,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, std::vector array_indices; llvm::Value* block_id = EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_); - llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(), + llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_counts().x, static_cast(block_id)); block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id"); @@ -85,16 +85,17 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, // %ntid.x is currently specified as 1024. llvm::Value* thread_id = EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_); - llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(), + llvm_ir::AddRangeMetadata(0, launch_dimensions_.thread_counts_per_block().x, static_cast(thread_id)); thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id"); llvm::Value* linear_index_base = b_->CreateAdd( - b_->CreateMul(block_id, - llvm::ConstantInt::get( - index_type, launch_dimensions_.threads_per_block()), - "", - /*HasNUW=*/true, /*HasNSW=*/true), + b_->CreateMul( + block_id, + llvm::ConstantInt::get( + index_type, launch_dimensions_.thread_counts_per_block().x), + "", + /*HasNUW=*/true, /*HasNSW=*/true), thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true); // Add an @llvm.assume(linear_index < threads_per_block * num_blocks). 
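For context on the index arithmetic in the hunk above: only the x components feed the per-thread linear index, while launch_bound() multiplies all six counts, so for a 1D launch the two agree. A minimal sketch using only the constructors and accessors introduced earlier in this diff (values are illustrative):

  // A 1D launch expressed through the new API.
  LaunchDimensions dims(/*block_x_count=*/16, /*thread_x_count_per_block=*/256);
  // y and z default to 1, so launch_bound() is 16 * 256 = 4096, which is the
  // same bound the emitted @llvm.assume places on linear_index
  // (block_counts().x * thread_counts_per_block().x).
  int64 bound = dims.launch_bound();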
@@ -109,9 +110,9 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, llvm::Intrinsic::assume, {b_->CreateICmpULT( linear_index_base, - llvm::ConstantInt::get(index_type, - launch_dimensions_.threads_per_block() * - launch_dimensions_.block_count()), + llvm::ConstantInt::get( + index_type, launch_dimensions_.thread_counts_per_block().x * + launch_dimensions_.block_counts().x), "linear_index_in_range")}, {}, b_); diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index d7468a31377..8ea7c57c978 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -209,16 +209,18 @@ StatusOr> CreateKernel( Status ExecuteKernelOnStream(const se::KernelBase& kernel, absl::Span args, - int64 threads_per_block, int64 block_count, - se::Stream* stream) { + const LaunchDimensions& dims, se::Stream* stream) { static constexpr int kKernelArgsLimit = 1024; auto kernel_args = absl::make_unique>(); for (const se::DeviceMemoryBase& buf : args) { kernel_args->add_device_memory_argument(buf); } - return stream->parent()->Launch(stream, se::ThreadDim(threads_per_block), - se::BlockDim(block_count), kernel, - *kernel_args); + LaunchDimensions::Dim3D thread_counts = dims.thread_counts_per_block(); + LaunchDimensions::Dim3D block_counts = dims.block_counts(); + return stream->parent()->Launch( + stream, se::ThreadDim(thread_counts.x, thread_counts.y, thread_counts.z), + se::BlockDim(block_counts.x, block_counts.y, block_counts.z), kernel, + *kernel_args); } se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h index 0a5e0e93a51..6696d1957b3 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -19,6 +19,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout.h" +#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -71,8 +72,7 @@ StatusOr> CreateKernel( // Runs loaded kernel on the stream with the provided arguments. Status ExecuteKernelOnStream(const se::KernelBase& kernel, absl::Span args, - int64 threads_per_block, int64 block_count, - se::Stream* stream); + const LaunchDimensions& dims, se::Stream* stream); // Create GpuAsmOpts out of HloModuleConfig. 
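Call sites of ExecuteKernelOnStream now hand over the whole LaunchDimensions, and all three components reach se::BlockDim/se::ThreadDim. A hedged sketch of a caller, assuming kernel, args and stream already exist and that Dim3D is a plain x/y/z aggregate, as the brace initializers earlier in this diff suggest:

  LaunchDimensions dims(LaunchDimensions::Dim3D{32, 4, 1},    // block counts
                        LaunchDimensions::Dim3D{128, 2, 1});  // threads per block
  TF_RETURN_IF_ERROR(ExecuteKernelOnStream(*kernel, args, dims, stream));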
se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config); diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a2bddd2d0d7..f6e3e965166 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -219,6 +219,28 @@ tf_cc_test( ], ) +tf_cc_test( + name = "parallel_reduction_test", + srcs = [ + "parallel_reduction_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_copy_test", srcs = ["gpu_copy_test.cc"], @@ -375,6 +397,8 @@ tf_cc_test( ":gpu_codegen_test", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_fusible", + "//tensorflow/compiler/xla/service/gpu:instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -458,6 +482,35 @@ xla_test( ], ) +tf_cc_test( + name = "sorting_test", + srcs = [ + "sorting_test.cc", + ], + tags = tf_cuda_tests_tags() + [ + "no_rocm", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_proto_cc", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_binary( name = "hlo_to_llvm_ir", srcs = ["hlo_to_llvm_ir.cc"], @@ -499,8 +552,15 @@ filegroup( # Binary with only the thunks dialect registered, for testing purposes. tf_cc_binary( name = "xla-thunks-opt", + srcs = ["xla_thunks_opt.cc"], deps = [ - "//tensorflow/compiler/mlir:tf_mlir_opt_main", - "//tensorflow/compiler/xla/service/gpu:xla_thunks_dialect_registration", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/xla/service/gpu:xla_thunks_ops", + "//tensorflow/core:lib", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Shape", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc index 674b436a8e3..811705d2b17 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -54,6 +56,37 @@ TEST_F(GpuFusionTest, FusedReshape) { )"); } +// Check that we limit the number of operands to fusions we create. +TEST_F(GpuFusionTest, FusedBiggerThanThresholdButDoNotChangeTheFusion) { + constexpr int64 kNumParams = kMaxOperandsAndOutputsPerFusion + 1; + + // Compute + // p0 + p1 + p2 + ... + pn, + // using so many parameters that they do not fit into one fusion. + auto module = CreateNewVerifiedModule(); + HloComputation::Builder b(TestName()); + Shape input_shape = ShapeUtil::MakeShape(F32, {10, 100}); + Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 2}); + Shape concat_shape = ShapeUtil::MakeShape(F32, {10, 2 * kNumParams}); + HloInstruction* input = + b.AddInstruction(HloInstruction::CreateParameter(0, input_shape, "p")); + + std::vector slice_params; + for (int64 i = 0; i < kNumParams; ++i) { + slice_params.push_back(b.AddInstruction(HloInstruction::CreateSlice( + slice_shape, input, {0, 0}, {10, 2}, {1, 1}))); + } + b.AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, slice_params, 1)); + module->AddEntryComputation(b.Build()); + EXPECT_TRUE(GpuInstructionFusion(false).Run(module.get()).ValueOrDie()); + EXPECT_TRUE(module->entry_computation()->root_instruction()->opcode() == + HloOpcode::kFusion); + for (HloInstruction* instr : module->entry_computation()->instructions()) { + EXPECT_TRUE(instr->opcode() != HloOpcode::kSlice); + } +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc index 1e39a4deaa7..8ec00d73711 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -51,7 +51,7 @@ TEST_F(GpuNoAliasTest, Concat) { hlo_module->AddEntryComputation(std::move(computation)); CompileAndVerifyIr(std::move(hlo_module), - R"(CHECK-LABEL: define void @fusion + R"(CHECK-LABEL: define{{.*}}void @fusion CHECK-SAME: i8* noalias align {{[0-9]*}} dereferenceable({{[0-9]*}}) %[[OUTPUT_ALLOC:[a-z0-9]*]] CHECK: %fusion.raw = {{.*}} %[[OUTPUT_ALLOC]])", /*match_optimized_ir=*/false); diff --git a/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc b/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc new file mode 100644 index 00000000000..06e547dfe34 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc @@ -0,0 +1,190 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" + +namespace xla { +namespace gpu { + +namespace { + +class ParallelReductionTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test contains a MOF fusion and the XLA optimizer passes + // don't like this. + debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(ParallelReductionTest, TwoParallelReductions) { + const char* hlo_text = R"( +HloModule TwoParallelReductions + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param0 = f32[1024] parameter(0) + %param1 = f32[1024] parameter(1) + %constant0 = f32[] constant(0) + %reduce1 = f32[] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce2 = f32[] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + ROOT %tuple = (f32[], f32[]) tuple(%reduce1, %reduce2) +} + +ENTRY %cluster { + %param0 = f32[1024] parameter(0) + %param1 = f32[1024] parameter(1) + ROOT %fusion = (f32[], f32[]) + fusion(%param0, %param1), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndVerifyIr(std::move(hlo_module), + R"( +CHECK: reduce-group-0 +CHECK: reduce-group-1 +CHECK-NOT: reduce-group-2 +)", + /*match_optimized_ir=*/false); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ParallelReductionTest, ManyParallelReductions) { + std::unique_ptr module = CreateNewVerifiedModule(); + // Simply use a number not too large to avoid long compilation time + // and not too small for meaningful test. 
+ const size_t num_reduces = 32; + + HloComputation* reduce_computation; + { + auto embedded_builder = HloComputation::Builder("add"); + HloInstruction* lhs = + embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "lhs")); + HloInstruction* rhs = + embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "rhs")); + embedded_builder.AddInstruction( + HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs)); + reduce_computation = + module->AddEmbeddedComputation(embedded_builder.Build()); + } + + Shape input_shape = ShapeUtil::MakeShape(F32, {1024}); + Shape output_shape = ShapeUtil::MakeShape(F32, {}); + HloComputation* fusion_computation; + { + auto fusion_builder = HloComputation::Builder("fusion_computation"); + std::vector outputs; + HloInstruction* constant = fusion_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + for (size_t i = 0; i < num_reduces; ++i) { + HloInstruction* param = fusion_builder.AddInstruction( + HloInstruction::CreateParameter(i, input_shape, "param")); + HloInstruction* output = + fusion_builder.AddInstruction(HloInstruction::CreateReduce( + output_shape, param, constant, {0}, reduce_computation)); + outputs.push_back(output); + } + fusion_builder.AddInstruction(HloInstruction::CreateTuple(outputs)); + fusion_computation = module->AddEmbeddedComputation(fusion_builder.Build()); + } + + HloComputation::Builder b(TestName()); + std::vector entry_params; + std::vector output_shapes; + for (size_t i = 0; i < num_reduces; ++i) { + HloInstruction* param = b.AddInstruction( + HloInstruction::CreateParameter(i, input_shape, "param")); + entry_params.push_back(param); + output_shapes.push_back(output_shape); + } + b.AddInstruction(HloInstruction::CreateFusion( + ShapeUtil::MakeTupleShape(output_shapes), + HloInstruction::FusionKind::kInput, entry_params, fusion_computation)); + module->AddEntryComputation(b.Build()); + + EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ParallelReductionTest, ThreeReductionGroups) { + const char* hlo_text = R"( +HloModule ThreeReductionGroups + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param0 = f32[1024,128] parameter(0) + %param1 = f32[1024,128] parameter(1) + %param2 = f32[1024,128] parameter(2) + %constant0 = f32[] constant(0) + // %mul0, %reduce0, and %reduce1 should go into a group. + %broadcast0 = f32[1024,128] broadcast(%constant0), dimensions={} + %mul0 = f32[1024,128] multiply(param0, broadcast0) + %reduce0 = f32[128] reduce(%mul0, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce1 = f32[128] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32 + // %reduce2 and %reduce3 should go into another group. + %reduce2 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce3 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + // %reduce4 and %mul2 should go into the other group, although broadcast0 is + // reused. 
+ %mul1 = f32[1024,128] multiply(param2, broadcast0) + %reduce4 = f32[128] reduce(%mul1, %constant0), dimensions={0}, to_apply=%add_f32 + %mul2 = f32[1024,128] multiply(param2, param2) + ROOT %tuple = + (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128]) + tuple(%mul2, %reduce0, %reduce4, %reduce3, %reduce2, %reduce1, %mul0) +} + +ENTRY %cluster { + %param0 = f32[1024,128] parameter(0) + %param1 = f32[1024,128] parameter(1) + %param2 = f32[1024,128] parameter(2) + ROOT %fusion = + (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128]) + fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndVerifyIr(std::move(hlo_module), + R"( +CHECK: reduce-group-0 +CHECK: reduce-group-1 +CHECK: reduce-group-2 +CHECK-NOT: reduce-group-3 +)", + /*match_optimized_ir=*/false); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 215c2e627ae..5f97452ff71 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -336,8 +336,17 @@ ENTRY %cluster { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + const se::DeviceDescription& device_description = + backend().default_stream_executor()->GetDeviceDescription(); + int cc_major = 0, cc_minor = 0; + device_description.cuda_compute_capability(&cc_major, &cc_minor); + + string expected; + if (cc_major < 6) { + // We do not vectorize for GPU before Pascal. 
+ expected = "CHECK-NOT: ld.global.nc.v2.f32"; + } else { + expected = R"( CHECK: ld.global.nc.v2.f32 CHECK: st.global.v2.f32 CHECK: st.global.v2.f32 @@ -350,7 +359,9 @@ CHECK: st.global.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: st.global.v2.f32 CHECK: st.global.v2.f32 -)"); +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo index c9e7daeb3bc..f625abe6612 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo +++ b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo @@ -1,6 +1,6 @@ // RUN: hlo_to_llvm_ir %s | FileCheck %s -// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* noalias align 16 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(24) %alloc1, i8* noalias align 16 dereferenceable(8) %alloc2) { // CHECK: entry: // CHECK: %[[VAL_32:.*]] = alloca i32, align 4 // CHECK: %[[VAL_0:.*]] = getelementptr inbounds i8, i8* %[[VAL_1:.*]], i64 0 @@ -43,8 +43,8 @@ // CHECK: store atomic i32 %[[VAL_36]], i32* %[[VAL_31]] unordered, align 4 // CHECK: br label %[[VAL_23]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"reqntidx", i32 6} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"reqntidx", i32 6} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} @@ -72,7 +72,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* noalias align 64 dereferenceable(4) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 %alloc3) { +// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* noalias align 16 dereferenceable(4) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 %alloc2) { // CHECK: entry: // CHECK: %[[VAL_60:.*]] = alloca i32, align 4 // CHECK: %[[VAL_37:.*]] = getelementptr inbounds i8, i8* %[[VAL_38:.*]], i64 0 @@ -104,8 +104,8 @@ ENTRY main { // CHECK: store atomic i32 %[[VAL_62]], i32* %[[VAL_39]] unordered, align 4 // CHECK: br label %[[VAL_57]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"reqntidx", i32 1} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} @@ -131,7 +131,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* noalias align 
16 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(24) %alloc1, i8* noalias align 16 dereferenceable(8) %alloc2) { // CHECK: %[[VAL_63:.*]] = alloca i32, align 4 // CHECK: %[[VAL_64:.*]] = alloca i32, align 4 // CHECK: %[[VAL_98:.*]] = alloca i32, align 4 @@ -188,8 +188,8 @@ ENTRY main { // CHECK: %[[VAL_109:.*]] = extractvalue { i32, i1 } %[[VAL_107]], 1 // CHECK: br i1 %[[VAL_109]], label %[[VAL_96]], label %[[VAL_104]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"reqntidx", i32 6} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"reqntidx", i32 6} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} @@ -216,7 +216,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* noalias align 64 dereferenceable(16) %alloc0, i8* noalias align 16 dereferenceable(16) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 dereferenceable(4) %alloc3) { +// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* noalias align 16 dereferenceable(16) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2) { // CHECK: entry: // CHECK: %[[VAL_146:.*]] = alloca i32, align 4 // CHECK: %[[VAL_118:.*]] = getelementptr inbounds i8, i8* %[[VAL_119:.*]], i64 0 @@ -253,8 +253,8 @@ ENTRY main { // CHECK: store atomic i32 %[[VAL_148]], i32* %[[VAL_145]] unordered, align 4 // CHECK: br label %[[VAL_138]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScalarUpdate, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScalarUpdate, !"reqntidx", i32 1} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_ScalarUpdate, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_ScalarUpdate, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo new file mode 100644 index 00000000000..4d29a8df116 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo @@ -0,0 +1,382 @@ +// RUN: hlo_to_llvm_ir %s | FileCheck %s + +HloModule TestModule + +compare { + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT +} + +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] +// 
CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define internal void @region_0_4(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_3_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_0_1_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_1_2_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_3_TYPED]], align 1 +// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_3_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 +// CHECK-NEXT: ret void + +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* 
[[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP8]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP8]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]] +// CHECK-NEXT: br i1 [[TMP14]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP15]], float* [[TMP16]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP17]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP15]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] +// CHECK-NEXT: store float [[TMP18]], float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +ENTRY main { + x = f32[2, 3] parameter(0) + ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare +} + +// ----- + +HloModule TestModule + +compare { + p.0.lhs = s32[] parameter(0) + p.0.rhs = s32[] parameter(1) + p.1.lhs = f32[] parameter(2) + p.1.rhs = f32[] parameter(3) + ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT +} + +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP10]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 1 +// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], [[TMP14]] +// CHECK-NEXT: [[TMP16:%.*]] = icmp slt i64 [[TMP14]], 3 +// CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]] +// CHECK-NEXT: br i1 [[TMP17]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP18]], i32* [[TMP19]], float* [[TMP20]], float* [[TMP21]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP22]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = 
getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP26]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: store float [[TMP28]], float* [[TMP30]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define internal void @region_0_6(i32* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_5_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_2_3_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_3_4_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_5_TYPED]], align 1 +// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_5_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 +// CHECK-NEXT: ret void + +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP10]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP10]], [[TMP13]] 
+// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]] +// CHECK-NEXT: br i1 [[TMP16]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP17]], i32* [[TMP18]], float* [[TMP19]], float* [[TMP20]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP21]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: store float [[TMP26]], float* [[TMP28]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), 
!range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] +// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK: sort.in_bounds-after: +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x [3 x i32]]* [[TMP1]] to i8* +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x [3 x float]]* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP16]], align 8 +// CHECK-NEXT: ret void +// CHECK: sort.in_bounds-true: +// CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 2 +// CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 1 +// CHECK-NEXT: [[TMP19:%.*]] = icmp slt i64 [[TMP17]], [[TMP18]] +// CHECK-NEXT: [[TMP20:%.*]] = icmp slt i64 [[TMP18]], 3 +// CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]] +// CHECK-NEXT: br i1 [[TMP21]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK: smaller_comparison_index-after: +// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] +// CHECK: smaller_comparison_index-true: +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP22]], i32* [[TMP23]], float* [[TMP24]], float* [[TMP25]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP26]], 0 +// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] +// CHECK: is_smaller_than-after: +// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] +// CHECK: is_smaller_than-true: +// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP22]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: store i32 [[TMP28]], i32* [[TMP30]], align 4 +// CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load float, float* [[TMP25]], align 4 
+// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: store float [[TMP31]], float* [[TMP33]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: store float [[TMP32]], float* [[TMP34]], align 4 +// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +ENTRY main { + x = s32[2, 3] parameter(0) + y = f32[2, 3] parameter(1) + ROOT sort = (s32[2, 3], f32[2, 3]) sort(x, y), dimensions={1}, to_apply=compare +} diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc b/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc new file mode 100644 index 00000000000..197a0c6cfeb --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class SortingTest : public GpuCodegenTest { + protected: + HloModuleConfig ConfigWithoutLayoutAssignment() { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + // Disable layout_assignment to use the preassigned layouts. 
+ debug_options.add_xla_disable_hlo_passes("layout-assignment"); + config.set_debug_options(debug_options); + return config; + } +}; + +TEST_F(SortingTest, Regression1) { + const char* hlo_text = R"( +HloModule TestModule + +compare { + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT +} + +ENTRY TestComputation { + x = f32[3, 2]{1, 0} parameter(0) + x.copy = f32[3, 2]{0, 1} copy(x) + ROOT sort = f32[3, 2]{0, 1} sort(x.copy), dimensions={1}, to_apply=compare +} + +)"; + + EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc b/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc new file mode 100644 index 00000000000..97c3b3a5bde --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h" +#include "tensorflow/core/platform/init_main.h" + +int main(int argc, char **argv) { + tensorflow::InitMlir y(&argc, &argv); + + mlir::registerAllPasses(); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + registry.insert(); + registry.insert(); + return failed( + mlir::MlirOptMain(argc, argv, "XLA-Thunk pass driver\n", registry)); +} diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 10751752571..2e2b668eba7 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_live_range.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -55,9 +56,10 @@ StatusOr HeapSimulator::MinimumMemoryForModule( // rather than summing each computation, since it gives us a better lower // bound, by minimizing the liveness of sub-computations. 
TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), *module, - schedule, *alias_analysis, size_function)); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), *module, + schedule, *alias_analysis, size_function)); return result.heap_size; } @@ -69,10 +71,11 @@ StatusOr HeapSimulator::MinimumMemoryForComputation( const absl::flat_hash_map* memory_by_computation) { TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), - computation, sequence, alias_analysis, size_function, - HeapSimulator::Options(), memory_by_computation)); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), computation, + sequence, alias_analysis, size_function, HeapSimulator::Options(), + memory_by_computation)); return result.heap_size; } @@ -82,16 +85,17 @@ StatusOr HeapSimulator::MinimumMemoryForComputation( const LogicalBuffer::SizeFunction& size_function, const HloSchedule* schedule) { TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), - computation, sequence, alias_analysis, size_function, - schedule, HeapSimulator::Options())); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), computation, + sequence, alias_analysis, size_function, schedule, + HeapSimulator::Options())); return result.heap_size; } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloModule& module, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, const HloModule& module, const HloSchedule& schedule, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const Options& options) { HeapSimulator heap(std::move(algorithm), size_fn, options, &schedule); @@ -108,8 +112,9 @@ StatusOr HeapSimulator::Run( } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloComputation& computation, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, + const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const Options& options, @@ -128,8 +133,9 @@ StatusOr HeapSimulator::Run( } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloComputation& computation, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, + const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule, @@ -326,12 +332,13 @@ Status HeapSimulator::RunComputation( } HeapSimulator::HeapSimulator( - std::unique_ptr algorithm, + std::unique_ptr> algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, const HloSchedule* schedule, const absl::flat_hash_map* memory_by_computation) - : no_fragmentation_stats_(absl::make_unique()), + : no_fragmentation_stats_( + absl::make_unique>()), algorithm_(std::move(algorithm)), size_fn_(size_fn), options_(options), @@ -396,8 +403,8 @@ void HeapSimulator::ShareBuffer(const HloValue* buffer, const HloValue* shared, shared); } -HeapSimulator::Result HeapSimulator::Finish() { - Result result = algorithm_->Finish(); +HeapSimulator::Result HeapSimulator::Finish() { + Result result = algorithm_->Finish(); // Post-process the result to add chunks for shared buffers. 
An empty chunk // map means that either no buffers were allocated, or the heap was only @@ -411,7 +418,7 @@ HeapSimulator::Result HeapSimulator::Finish() { } // Fragmentation is the difference between the actual and ideal sizes. - const Result no_frag_result = no_fragmentation_stats_->Finish(); + const Result no_frag_result = no_fragmentation_stats_->Finish(); result.fragmentation_size = result.heap_size - no_frag_result.heap_size; // Copy the debug trace we collected to the final result. @@ -437,14 +444,17 @@ void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, } } -void NoFragmentationStatsHeap::Alloc(const HloValue* buffer, int64 size) { +template +void NoFragmentationStatsHeap::Alloc(const BufferType* buffer, + int64 size) { current_heap_size_ += size; if (current_heap_size_ > max_heap_size_) { max_heap_size_ = current_heap_size_; } } -void NoFragmentationStatsHeap::AccountForSubcomputationMemory( +template +void NoFragmentationStatsHeap::AccountForSubcomputationMemory( const HloInstruction* instruction, int64 alloc_size_by_instruction, const absl::flat_hash_map& memory_by_computation) { @@ -472,11 +482,15 @@ void NoFragmentationStatsHeap::AccountForSubcomputationMemory( std::max(max_heap_size_, current_heap_size_ + max_subcomputation_bytes); } -void NoFragmentationStatsHeap::Free(const HloValue* buffer, int64 size) { +template +void NoFragmentationStatsHeap::Free(const BufferType* buffer, + int64 size) { current_heap_size_ -= size; } -HeapSimulator::Result NoFragmentationStatsHeap::Finish() { +template +HeapSimulator::Result +NoFragmentationStatsHeap::Finish() { // The result.chunk_map is empty, since we only collect stats, and don't // actually compute chunk assignments. Result result; @@ -484,7 +498,8 @@ HeapSimulator::Result NoFragmentationStatsHeap::Finish() { return result; } -GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( +template +GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( int64 alignment, Type type) : alignment_(alignment) { if (type == kTemporal) { @@ -495,8 +510,10 @@ GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( } } -GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare -GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() const { +template +typename GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare +GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() + const { return [&](const BufferInterval& x, const BufferInterval& y) { int64 x_end = x.end; for (auto colocation : GetTransitiveColocations(x)) { @@ -515,12 +532,14 @@ GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() const { if (x.size != y.size) { return x.size > y.size; } - return x.buffer->id() < y.buffer->id(); + return *x.buffer < *y.buffer; }; } -/*static*/ GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare -GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { +template +/*static*/ typename GlobalDecreasingSizeBestFitHeap< + BufferType>::BufferIntervalCompare +GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { return [&](const BufferInterval& x, const BufferInterval& y) { if (x.size != y.size) { return x.size > y.size; @@ -528,12 +547,13 @@ GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { if (x.end - x.start != y.end - y.start) { return x.end - x.start > y.end - y.start; } - return x.buffer->id() < y.buffer->id(); + return *x.buffer < *y.buffer; }; } -void GlobalDecreasingSizeBestFitHeap::Alloc(const HloValue* buffer, - int64 
size) { +template +void GlobalDecreasingSizeBestFitHeap::Alloc( + const BufferType* buffer, int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. if (size == 0) { result_.chunk_map.emplace(buffer, Chunk{0, 0}); @@ -546,9 +566,9 @@ void GlobalDecreasingSizeBestFitHeap::Alloc(const HloValue* buffer, ++current_time_; } -void GlobalDecreasingSizeBestFitHeap::ShareWith(const HloValue* buffer, - const HloValue* share_with, - int64 size) { +template +void GlobalDecreasingSizeBestFitHeap::ShareWith( + const BufferType* buffer, const BufferType* share_with, int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. if (size == 0) { result_.chunk_map.emplace(buffer, Chunk{0, 0}); @@ -562,15 +582,16 @@ void GlobalDecreasingSizeBestFitHeap::ShareWith(const HloValue* buffer, ++current_time_; } -absl::flat_hash_set -GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( +template +absl::flat_hash_set +GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( const BufferInterval& interval) const { - absl::flat_hash_set result; + absl::flat_hash_set result; std::vector worklist = {&interval}; while (!worklist.empty()) { const BufferInterval* item = worklist.back(); worklist.pop_back(); - for (const HloValue* buffer_colocated : item->colocations) { + for (const BufferType* buffer_colocated : item->colocations) { result.insert(buffer_colocated); worklist.push_back(&buffer_intervals_.at(buffer_colocated)); } @@ -579,7 +600,9 @@ GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( return result; } -void GlobalDecreasingSizeBestFitHeap::Free(const HloValue* buffer, int64 size) { +template +void GlobalDecreasingSizeBestFitHeap::Free(const BufferType* buffer, + int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. 
if (size == 0) { return; @@ -785,7 +808,9 @@ std::vector BufferIntervalTree::ChunksOverlappingInTime( return result; } -HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() { +template +HeapSimulator::Result +GlobalDecreasingSizeBestFitHeap::Finish() { std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -803,8 +828,10 @@ HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() { return result_; } -std::vector -GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { +template +std::vector< + typename GlobalDecreasingSizeBestFitHeap::BufferInterval> +GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { std::vector sorted_buffer_intervals; for (auto& entry : buffer_intervals_) { sorted_buffer_intervals.push_back(entry.second); @@ -814,8 +841,9 @@ GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { return sorted_buffer_intervals; } -GlobalDecreasingSizeBestFitHeap::ChunkCandidate -GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( +template +typename GlobalDecreasingSizeBestFitHeap::ChunkCandidate +GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, int64 preferred_offset) const { VLOG(1) << "Finding chunks for buffer: " @@ -912,9 +940,12 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( return chunk_candidate; } -void GlobalDecreasingSizeBestFitHeap::CommitChunk( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, - GlobalDecreasingSizeBestFitHeap::ChunkCandidate chunk_candidate) { +template +void GlobalDecreasingSizeBestFitHeap::CommitChunk( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& + buffer_interval, + GlobalDecreasingSizeBestFitHeap::ChunkCandidate + chunk_candidate) { // Update the maximum heap size according to the one determined by the chunk // candidate. result_.heap_size = chunk_candidate.heap_size; @@ -930,13 +961,16 @@ void GlobalDecreasingSizeBestFitHeap::CommitChunk( AddToChunkMap(buffer_interval.buffer, chunk_candidate.chunk); } -void GlobalDecreasingSizeBestFitHeap::AddToChunkMap(const HloValue* buffer, - Chunk chunk) { +template +void GlobalDecreasingSizeBestFitHeap::AddToChunkMap( + const BufferType* buffer, Chunk chunk) { const auto emplace_result = result_.chunk_map.emplace(buffer, chunk); DCHECK(emplace_result.second); } -HeapSimulator::Result ChooseBestHeapAlgorithm::Finish() { +template +HeapSimulator::Result +ChooseBestHeapAlgorithm::Finish() { DCHECK(!algorithms_.empty()); std::vector results(algorithms_.size()); int64 min_size = INT64_MAX; @@ -953,4 +987,9 @@ HeapSimulator::Result ChooseBestHeapAlgorithm::Finish() { return results[min_size_index]; } +template class GlobalDecreasingSizeBestFitHeap; +template class GlobalDecreasingSizeBestFitHeap< + MemorySpaceAssignmentRepacker::AllocationBlock>; +template class ChooseBestHeapAlgorithm; + } // namespace xla diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index d3b781ded0c..b47ff685139 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -40,7 +40,9 @@ limitations under the License. namespace xla { // Forward declare classes defined below. +template class HeapAlgorithm; +template class NoFragmentationStatsHeap; // HeapSimulator assigns buffer offsets by running a simulation of a regular @@ -66,9 +68,10 @@ class HeapSimulator { }; // Result represents the result of the heap simulation. 
+ template struct Result { // The assignment of buffers to chunks. - absl::flat_hash_map chunk_map; + absl::flat_hash_map chunk_map; // The total size in bytes of the heap, containing all assigned chunks. int64 heap_size = 0; @@ -128,19 +131,19 @@ class HeapSimulator { // to running on a per-computation basis, since we can re-use buffer space for // called sub-computations. // - static StatusOr Run(std::unique_ptr algorithm, - const HloModule& module, - const HloSchedule& schedule, - const HloAliasAnalysis& alias_analysis, - const BufferValue::SizeFunction& size_fn, - const Options& options = Options()); + static StatusOr> Run( + std::unique_ptr> algorithm, + const HloModule& module, const HloSchedule& schedule, + const HloAliasAnalysis& alias_analysis, + const BufferValue::SizeFunction& size_fn, + const Options& options = Options()); // Same as above, but runs on a single computation. The 'instruction_sequence' // must contain a topologically-consistent total ordering of all instructions // in the computation. The result is invalid if instructions are not run in // exactly this sequence. - static StatusOr Run( - std::unique_ptr algorithm, + static StatusOr> Run( + std::unique_ptr> algorithm, const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, @@ -151,8 +154,8 @@ class HeapSimulator { // Same as above, but runs with a schedule that covers all nested // computations. - static StatusOr Run( - std::unique_ptr algorithm, + static StatusOr> Run( + std::unique_ptr> algorithm, const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, @@ -163,7 +166,7 @@ class HeapSimulator { // If 'schedule' is non-null, it is used to find kCall and kWhile // sub-computations, and the heap simulation for those sub-computations will // be run recursively. I.e. the simulation is run over the whole module. - HeapSimulator(std::unique_ptr algorithm, + HeapSimulator(std::unique_ptr> algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, const HloSchedule* schedule = nullptr, const absl::flat_hash_map* @@ -187,7 +190,7 @@ class HeapSimulator { // Two buffers belong to the same shared group. // Neither of the buffers has a shared group assigned. bool InSameSharedGroup(const HloValue* left, const HloValue* right); - Result Finish(); + Result Finish(); void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, const HloValue* buffer, const HloInstruction* instruction, @@ -196,8 +199,9 @@ class HeapSimulator { // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap, // in which case we are calculating the same allocs/frees twice in the // simulation. - const std::unique_ptr no_fragmentation_stats_; - const std::unique_ptr algorithm_; + const std::unique_ptr> + no_fragmentation_stats_; + const std::unique_ptr> algorithm_; const BufferValue::SizeFunction size_fn_; const Options options_; // schedule_ is set by buffer assignment, and memory_by_computation_ is @@ -220,15 +224,16 @@ class HeapSimulator { // offsets to buffers. A sequence of Alloc / Free calls will be made, with the // same semantics as a regular memory heap. Finish will be called at the end to // collect the simulation results. +template class HeapAlgorithm { public: using Chunk = HeapSimulator::Chunk; - using Result = HeapSimulator::Result; + using Result = HeapSimulator::Result; virtual ~HeapAlgorithm() = default; // Alloc allocates a buffer of 'size' bytes.
- virtual void Alloc(const HloValue* buffer, int64 size) = 0; + virtual void Alloc(const BufferType* buffer, int64 size) = 0; // Takes memory usage of subcomputations into account when calculating the // memory usage of a computation. Currently, we don't handle buffer aliasing @@ -247,7 +252,7 @@ class HeapAlgorithm { memory_by_computation) {} // Free de-allocates a previously allocated buffer. - virtual void Free(const HloValue* buffer, int64 size) = 0; + virtual void Free(const BufferType* buffer, int64 size) = 0; // Indicates that a buffer has to be collocated with another buffer. In // addition to Alloc and Free, the heap simulator exposes a concept of buffer @@ -255,7 +260,7 @@ class HeapAlgorithm { // the buffer, it associates the buffer with a previously allocated (or // shared) buffer. Each group of mutually-shared buffers points to a single // SharedGroup instance, which is a shared control block. - virtual void ShareWith(const HloValue* buffer, const HloValue* share_with, + virtual void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) { Alloc(buffer, size); } @@ -269,19 +274,22 @@ class HeapAlgorithm { // this is the absolute minimum size for a given instruction sequence. The // result.chunk_map returned in Finish is always empty, since we only collect // stats, and don't actually compute chunk assignments. -class NoFragmentationStatsHeap : public HeapAlgorithm { +template +class NoFragmentationStatsHeap : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + NoFragmentationStatsHeap() = default; ~NoFragmentationStatsHeap() override = default; - void Alloc(const HloValue* buffer, int64 size) override; + void Alloc(const BufferType* buffer, int64 size) override; void AccountForSubcomputationMemory( const HloInstruction* instruction, int64 alloc_size_by_instruction, const absl::flat_hash_map& memory_by_computation) override; - void Free(const HloValue* buffer, int64 size) override; + void Free(const BufferType* buffer, int64 size) override; Result Finish() override; @@ -336,8 +344,12 @@ class BufferIntervalTree { // alloc/free time. It internally tracks the allocated buffers and their live // intervals; when allocating a buffer, it finds the best-fit free chunk during // its live interval. -class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { +template +class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + using Chunk = HeapSimulator::Chunk; + enum Type { kSpatial = 0, kTemporal, @@ -345,7 +357,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // BufferInterval stores a buffer's size and time interval. struct BufferInterval { - const HloValue* buffer; + const BufferType* buffer; int64 size; // Alloc time of the buffer. int64 start; @@ -353,7 +365,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { int64 end; // Colocation buffers that need to be collocated with this one. - std::vector colocations; + std::vector colocations; // True if this buffer needs an allocation. False if it is collocated with // other buffer. 
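Because HeapAlgorithm is now templated on the buffer type, a backend can run the simulation over any buffer-like type, not just HloValue. A minimal sketch of a conforming algorithm is below; it only assumes the Alloc/Free/Finish interface and the Result/Chunk types shown in this header, and the TrivialBumpAllocator name is hypothetical, not part of this change.

template <typename BufferType>
class TrivialBumpAllocator : public HeapAlgorithm<BufferType> {
 public:
  using Chunk = HeapSimulator::Chunk;
  using Result = HeapSimulator::Result<BufferType>;

  // Place every buffer at the current end of the heap; Free never reclaims
  // space, so the final heap_size is simply the sum of all allocation sizes.
  void Alloc(const BufferType* buffer, int64 size) override {
    chunks_.emplace(buffer, Chunk{offset_, size});
    offset_ += size;
  }
  void Free(const BufferType* buffer, int64 size) override {}

  Result Finish() override {
    Result result;
    result.chunk_map = std::move(chunks_);
    result.heap_size = offset_;
    return result;
  }

 private:
  absl::flat_hash_map<const BufferType*, Chunk> chunks_;
  int64 offset_ = 0;
};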
@@ -368,10 +380,10 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { Type type = kSpatial); ~GlobalDecreasingSizeBestFitHeap() override {} - void Alloc(const HloValue* buffer, int64 size) override; - void Free(const HloValue* buffer, int64 size) override; + void Alloc(const BufferType* buffer, int64 size) override; + void Free(const BufferType* buffer, int64 size) override; - void ShareWith(const HloValue* buffer, const HloValue* share_with, + void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) override; Result Finish() override; @@ -404,7 +416,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { void CommitChunk(const BufferInterval& buffer_interval, ChunkCandidate chunk_candidate); // Adds the buffer and the chunk to the result chunk map. - virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk); + virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk); // Return a BufferIntervalCompare function that sorts by live ranges. A live // range is defined by the range between the start of the first buffer and the @@ -413,7 +425,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // contiguous. BufferIntervalCompare GetTemporalBufferIntervalCompare() const; - absl::flat_hash_map buffer_intervals_; + absl::flat_hash_map buffer_intervals_; Result result_; BufferIntervalCompare buffer_interval_compare_; BufferIntervalTree interval_tree_; @@ -428,33 +440,37 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // Returns all transitive colocated buffers of this buffer interval. I.e., If // a buffer A is colocated with B and B is colocated with C, this function // returns all three of them. - absl::flat_hash_set GetTransitiveColocations( + absl::flat_hash_set GetTransitiveColocations( const BufferInterval& interval) const; }; // A heap algorithm that chooses the best results from other algorithms added to // it. 
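One plausible way to drive the ChooseBestHeapAlgorithm declared next is to hand it both orderings of GlobalDecreasingSizeBestFitHeap and keep whichever packing turns out smaller. The sketch below is illustrative only: the MakeChooser helper and the alignment value are hypothetical, and the constructor argument type is assumed from the declaration that follows.

std::unique_ptr<HeapAlgorithm<HloValue>> MakeChooser() {
  auto algorithms = absl::make_unique<
      std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
  // Candidate 1: sort buffer intervals by live range (temporal ordering).
  algorithms->push_back(
      absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
          /*alignment=*/64,
          GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
  // Candidate 2: sort buffer intervals by size (spatial ordering).
  algorithms->push_back(
      absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
          /*alignment=*/64,
          GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
  // Finish() on the returned algorithm reports whichever candidate ended up
  // with the smallest heap_size.
  return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
      std::move(algorithms));
}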
-class ChooseBestHeapAlgorithm : public HeapAlgorithm { +template +class ChooseBestHeapAlgorithm : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + ChooseBestHeapAlgorithm( - std::unique_ptr>> algorithms) + std::unique_ptr>>> + algorithms) : algorithms_(std::move(*algorithms)) {} ~ChooseBestHeapAlgorithm() override {} - void Alloc(const HloValue* buffer, int64 size) override { + void Alloc(const BufferType* buffer, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->Alloc(buffer, size); } } - void ShareWith(const HloValue* buffer, const HloValue* share_with, + void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->ShareWith(buffer, share_with, size); } } - void Free(const HloValue* buffer, int64 size) override { + void Free(const BufferType* buffer, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->Free(buffer, size); } @@ -463,7 +479,7 @@ class ChooseBestHeapAlgorithm : public HeapAlgorithm { Result Finish() override; private: - std::vector> algorithms_; + std::vector>> algorithms_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index b5b711cab4f..8f7668b4965 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -228,7 +228,7 @@ const char kFinish[] = "Finish"; using CallSequence = std::vector>; // HeapCallRecorder is a dummy heap algorithm that simply records its calls. -class HeapCallRecorder : public HeapAlgorithm { +class HeapCallRecorder : public HeapAlgorithm { public: explicit HeapCallRecorder(CallSequence* calls) : calls_(calls) {} ~HeapCallRecorder() override {} @@ -396,7 +396,7 @@ class HeapSimulatorTracker { std::unique_ptr module_; std::unique_ptr alias_analysis_; CallSequence actual_calls_; - HeapSimulator::Result result_; + HeapSimulator::Result result_; }; class HeapSimulatorTest : public HloTestBase { @@ -976,12 +976,12 @@ class HeapAlgorithmTestBase : public ::testing::Test { class NoFragmentationStatsHeapTest : public HeapAlgorithmTestBase {}; TEST_F(NoFragmentationStatsHeapTest, Empty) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; EXPECT_EQ(0, heap.Finish().heap_size); } TEST_F(NoFragmentationStatsHeapTest, Simple) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 30); @@ -994,7 +994,7 @@ TEST_F(NoFragmentationStatsHeapTest, Simple) { } TEST_F(NoFragmentationStatsHeapTest, Mixed) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; heap.Alloc(buffer_a_, 10); // max: A heap.Alloc(buffer_b_, 20); // max: A+B @@ -1013,7 +1013,7 @@ TEST_F(NoFragmentationStatsHeapTest, Mixed) { class GlobalDecreasingSizeBestFitHeapTest : public HeapAlgorithmTestBase { protected: class InheritedGlobalDecreasingSizeBestFitHeap - : public GlobalDecreasingSizeBestFitHeap { + : public GlobalDecreasingSizeBestFitHeap { public: InheritedGlobalDecreasingSizeBestFitHeap() : GlobalDecreasingSizeBestFitHeap(/*alignment=*/1) {} @@ -1048,8 +1048,8 @@ class GlobalDecreasingSizeBestFitHeapTest : public HeapAlgorithmTestBase { }; TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) { - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); - const HeapSimulator::Result result = heap.Finish(); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + 
const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(0, result.heap_size); EXPECT_EQ(0, result.chunk_map.size()); } @@ -1068,7 +1068,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { // | | d | // | +-------+ // -----------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 30); heap.Alloc(buffer_c_, 20); @@ -1078,7 +1078,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { heap.Free(buffer_c_, 20); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(100, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); @@ -1107,7 +1107,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) { // | | | // | +-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/20); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/20); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 50); @@ -1117,7 +1117,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) { heap.Free(buffer_c_, 50); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(120, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1148,7 +1148,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) { // | | | // | +-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 40); @@ -1160,7 +1160,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) { heap.Free(buffer_d_, 30); heap.Free(buffer_e_, 50); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(140, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1184,7 +1184,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) { // || |+----+| | // |+--a---++-b--++---c---+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 40); heap.Free(buffer_a_, 40); heap.Alloc(buffer_b_, 20); @@ -1192,7 +1192,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) { heap.ShareWith(buffer_c_, buffer_a_, 40); heap.Free(buffer_c_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1212,7 +1212,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) { // || | | | <--- colocate with a // |+--a---+ +---c---+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 40); heap.Free(buffer_a_, 40); heap.Alloc(buffer_b_, 20); @@ -1221,7 +1221,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) { heap.Free(buffer_c_, 40); heap.Free(buffer_b_, 20); - const HeapSimulator::Result 
result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(60, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1242,7 +1242,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) { // | | | // | +-------b-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Free(buffer_a_, 10); heap.Alloc(buffer_b_, 30); @@ -1251,7 +1251,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) { heap.Free(buffer_c_, 10); heap.Free(buffer_b_, 30); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 960f60fe882..c3a7b3a5c14 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 72 +// Next ID: 74 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -248,6 +248,12 @@ message HloInstructionProto { // RNG algorithm used by kRngBitGenerator. xla.RandomAlgorithm rng_algorithm = 70; + + // The comparison type used for kCompare. + string comparison_type = 72; + + // Specifies if this is a cross-program-prefetch, used by kCopyStart. + bool is_cross_program_prefetch = 73; } // Serialization of HloComputation. @@ -283,6 +289,16 @@ message HloScheduleProto { map sequences = 1; } +enum Kind { + // Define a UNDEFINED_ALIAS equal to zero to get around the default-0 proto3 + // behavior and missing has_*() APIs. + UNDEFINED_ALIAS = 0; + // The buffers may or may not alias at runtime. + MAY_ALIAS = 1; + // The buffers must alias at runtime. + MUST_ALIAS = 2; +} + message HloInputOutputAliasProto { // The following proto describes a pair of aliased an input // (described by parameter number and a ShapeIndex of the parameter) @@ -304,8 +320,8 @@ message HloInputOutputAliasProto { int64 parameter_number = 2; // ShapeIndex of the parameter instruction. repeated int64 parameter_shape_index = 3; - reserved 4; - reserved "kind"; + // The kind of alias to be setup. + Kind kind = 4; } repeated AliasEntryProto entries = 1; diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc index 384ae272dc1..cf09ddeec27 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -308,6 +308,39 @@ class BufferValueMap { } } + void ComputeInPlaceOperationAliasedBuffers( + const HloValue& value, std::vector* aliased_buffers) { + VLOG(3) << "Compute aliases for in-place operations (e.g. 
" + "kDynamicUpdateSlice and kScatter)"; + for (const HloPosition& position : value.positions()) { + HloInstruction* instruction = position.instruction; + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instruction)) { + if (position.index == operand_and_output_index.second) { + const HloUse& operand = operand_and_output_index.first; + const HloValue& operand_value = dataflow_.GetUniqueValueAt( + instruction->operand(operand.operand_number), + operand.operand_index); + VLOG(3) << " operand value " << operand_value.ToShortString() + << " aliases."; + aliased_buffers->push_back(GetBufferForValue(operand_value)); + } + } + } + + for (const HloUse& use : value.uses()) { + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(use.instruction)) { + if (use == operand_and_output_index.first) { + const HloValue& use_value = dataflow_.GetUniqueValueAt( + use.instruction, operand_and_output_index.second); + VLOG(3) << " use value " << use_value.ToShortString() << " aliases."; + aliased_buffers->push_back(GetBufferForValue(use_value)); + } + } + } + } + // Compute and return a vector of buffers that the given value must be // contained in due to HLO aliasing rules. std::vector ComputeAliasedBuffers(const HloValue& value) { @@ -318,6 +351,7 @@ class BufferValueMap { ComputeInputOutputAliasedBuffers(value, &aliased_buffers); ComputeWhileAliasedBuffers(value, &aliased_buffers); ComputeConditionalAliasedBuffers(value, &aliased_buffers); + ComputeInPlaceOperationAliasedBuffers(value, &aliased_buffers); // Uniquify aliased buffers. absl::c_sort(aliased_buffers); aliased_buffers.erase( diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 2666cb0872d..5e94f1d173e 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -1062,6 +1062,118 @@ TEST_F(HloAliasAnalysisTest, MergeBuffersReverse) { analysis.BufferLivesOut(analysis.buffers()[0]); } +TEST_F(HloAliasAnalysisTest, DynamicUpdateSlice) { + Shape shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {4}); + Shape index_shape = ShapeUtil::MakeShape(S32, {}); + auto builder = HloComputation::Builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, update_shape, "param1")); + auto param2 = builder.AddInstruction( + HloInstruction::CreateParameter(2, index_shape, "param2")); + auto copy0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, param0)); + auto dynamic_update_slice = builder.AddInstruction( + HloInstruction::CreateDynamicUpdateSlice(shape, copy0, param1, {param2})); + + module_->AddEntryComputation(builder.Build()); + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.GetUniqueBufferAt(copy0), + analysis.GetUniqueBufferAt(dynamic_update_slice)); +} + +TEST_F(HloAliasAnalysisTest, DynamicUpdateSliceMultiOutputFusion) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + 
add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + ROOT fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(hlo_string)); + + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + LOG(INFO) << analysis.ToString(); + + // Expect negate1 and negate2 to alias with fusion{1} and fusion{2} + // respectively (due to DUS), but not negate0 and fusion{0}. + const HloInstruction* fusion = + module_->entry_computation()->GetInstructionWithName("fusion"); + const HloInstruction* negate0 = + module_->entry_computation()->GetInstructionWithName("negate0"); + const HloInstruction* negate1 = + module_->entry_computation()->GetInstructionWithName("negate1"); + const HloInstruction* negate2 = + module_->entry_computation()->GetInstructionWithName("negate2"); + EXPECT_EQ(analysis.GetUniqueBufferAt(negate1), + analysis.GetUniqueBufferAt(fusion, {1})); + EXPECT_EQ(analysis.GetUniqueBufferAt(negate2), + analysis.GetUniqueBufferAt(fusion, {2})); + EXPECT_NE(analysis.GetUniqueBufferAt(negate0), + analysis.GetUniqueBufferAt(fusion, {0})); +} + +TEST_F(HloAliasAnalysisTest, ChainedDynamicUpdateSliceFusion) { + // CPU and GPU backends may generate fusions with dynamic update slices + // feeding each other. They expect the fusion to not be in-place if that is + // the case. + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) + ROOT dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + ROOT fusion = f32[1280,1,128] fusion(negate0), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(hlo_string)); + + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + LOG(INFO) << analysis.ToString(); + + const HloInstruction* fusion = + module_->entry_computation()->GetInstructionWithName("fusion"); + const HloInstruction* negate0 = + module_->entry_computation()->GetInstructionWithName("negate0"); + EXPECT_NE(analysis.GetUniqueBufferAt(negate0), + analysis.GetUniqueBufferAt(fusion)); +} + TEST_F(HloAliasAnalysisTest, BitcastInterference) { // A bitcast value simultaneously live with its operand should not cause // interference. 
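The tests above rely on the dataflow analysis reporting which operand/output pairs of an instruction (or fusion) must alias. A small sketch of querying that information follows; it assumes the static HloDataflowAnalysis::GetInPlaceInputOutputPairs API added later in this change, and the LogMustAliasPairs helper is hypothetical.

// Sketch: list the operand/output pairs that must alias for an instruction,
// e.g. operand 0 of a dynamic-update-slice aliasing its output.
void LogMustAliasPairs(HloInstruction* instruction) {
  for (const auto& pair :
       HloDataflowAnalysis::GetInPlaceInputOutputPairs(instruction)) {
    const HloUse& operand_use = pair.first;        // operand number + index
    const ShapeIndex& output_index = pair.second;  // output shape index
    VLOG(2) << instruction->name() << ": operand "
            << operand_use.operand_number << " at index "
            << operand_use.operand_index.ToString()
            << " must alias output index " << output_index.ToString();
  }
}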
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 438aa6ff05f..75a6dcdfdd2 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -545,7 +545,7 @@ string HloComputation::ToString( if (options.print_percent()) { s << "%"; } - if (options.print_ids() || !IsEntryComputation()) { + if (options.print_ids()) { // Exclude entry computation's name because it includes and leads to // non-deterministic fingerprint. s << PrintName(name(), options.print_ids()) << " "; @@ -836,8 +836,9 @@ ProgramShape HloComputation::ComputeProgramShape(bool include_ids) const { return program_shape; } -bool HloComputation::Equal(const HloComputation& other, - bool is_layout_sensitive) const { +bool HloComputation::EqualInternal(const HloComputation& other, + bool is_layout_sensitive, + bool ignore_channel_id_values) const { if (this == &other) { return true; } @@ -855,15 +856,21 @@ bool HloComputation::Equal(const HloComputation& other, continue; } visited.emplace(pair); - // TODO(b/123082518): Avoid recursively invoking == because it may + // TODO(b/123082518): Avoid recursively invoking Equal because it may // cause a stack overflow with deeply nested subcomputations. - bool identical_ignoring_operands = pair.first->Identical( - *pair.second, - [](const HloInstruction*, const HloInstruction*) { return true; }, - [](const HloComputation* a, const HloComputation* b) { - return *a == *b; - }, - is_layout_sensitive); + auto operands_eq = [](const HloInstruction*, const HloInstruction*) { + return true; + }; + auto comp_eq = [&](const HloComputation* a, const HloComputation* b) { + return a->EqualInternal(*b, is_layout_sensitive, + ignore_channel_id_values); + }; + bool identical_ignoring_operands = + ignore_channel_id_values + ? pair.first->IdenticalIgnoringChannelIdValues( + *pair.second, operands_eq, comp_eq, is_layout_sensitive) + : pair.first->Identical(*pair.second, operands_eq, comp_eq, + is_layout_sensitive); if (!identical_ignoring_operands) { return false; } diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index d640007886c..1dcf1d9d7d3 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -310,7 +310,19 @@ class HloComputation { ProgramShape ComputeProgramShape(bool include_ids = true) const; // Return whether `*this` and `other` are functionally equivalent. - bool Equal(const HloComputation& other, bool is_layout_sensitive) const; + bool Equal(const HloComputation& other, bool is_layout_sensitive) const { + return EqualInternal(other, is_layout_sensitive, + /*ignore_channel_id_values=*/false); + } + + // Same as Equal() but ignores channel ID value mismatches on instructions, as + // long as the two instructions both have channel IDs or neither has a channel + // ID. + bool EqualIgnoringChannelIdValues(const HloComputation& other, + bool is_layout_sensitive) const { + return EqualInternal(other, is_layout_sensitive, + /*ignore_channel_id_values=*/true); + } // Return whether `*this` and `other` are functionally equivalent. bool operator==(const HloComputation& other) const { @@ -489,6 +501,10 @@ class HloComputation { HloInstruction* AddInstructionInternal( std::unique_ptr instruction); + // Internal helper for comparison with different options. 
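A short usage sketch of the two comparison entry points on HloComputation; the SameModuloChannelIds helper is hypothetical and only meant to show how the new method relates to Equal().

bool SameModuloChannelIds(const HloComputation& a, const HloComputation& b) {
  // Equal() fails if any channel_id differs; EqualIgnoringChannelIdValues()
  // tolerates different values as long as both instructions either carry a
  // channel_id or both lack one.
  if (a.Equal(b, /*is_layout_sensitive=*/true)) {
    return true;  // Identical, channel IDs included.
  }
  return a.EqualIgnoringChannelIdValues(b, /*is_layout_sensitive=*/true);
}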
+ bool EqualInternal(const HloComputation& other, bool is_layout_sensitive, + bool ignore_channel_id_values) const; + // Fuses HLOs in instructions_to_fuse into fusion_instruction. // // Pre-condition: fusion_instruction's opcode is kFusion. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 72b15db0dcd..939c713fc18 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -486,6 +486,10 @@ Status HloCostAnalysis::HandleReshape(const HloInstruction*) { return Status::OK(); } +Status HloCostAnalysis::HandleDynamicReshape(const HloInstruction*) { + return Status::OK(); +} + Status HloCostAnalysis::HandleBatchNormTraining(const HloInstruction*) { // TODO(b/62294698): Implement cost analysis for batch-norm-training. return Status::OK(); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index d9085dd7785..f101e3819c9 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -113,6 +113,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleBroadcast(const HloInstruction* broadcast) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; + Status HandleDynamicReshape(const HloInstruction* reshape) override; Status HandleAddDependency(const HloInstruction* add_dependency) override; Status HandleAfterAll(const HloInstruction* token) override; Status HandleTranspose(const HloInstruction* transpose) override; diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 4ba67888409..4aeeb6d27ac 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -92,16 +92,17 @@ StatusOr MakeSliceHlo(HloInstruction* operand, StatusOr MakeConvolveHlo( HloInstruction* lhs, HloInstruction* rhs, int64 feature_group_count, - const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 batch_group_count, const Window& window, + const ConvolutionDimensionNumbers& dimension_numbers, const PrecisionConfig& precision_config) { HloComputation* computation = lhs->parent(); CHECK_EQ(computation, rhs->parent()); TF_ASSIGN_OR_RETURN(Shape convolve_shape, ShapeInference::InferConvolveShape( - lhs->shape(), rhs->shape(), feature_group_count, 1, - window, dimension_numbers)); + lhs->shape(), rhs->shape(), feature_group_count, + batch_group_count, window, dimension_numbers)); return computation->AddInstruction(HloInstruction::CreateConvolve( - convolve_shape, lhs, rhs, feature_group_count, 1, window, + convolve_shape, lhs, rhs, feature_group_count, batch_group_count, window, dimension_numbers, precision_config)); } diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index 2b17ae3d967..53eeeffb858 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -61,7 +61,8 @@ StatusOr MakeSliceHlo(HloInstruction* operand, // containing `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation). 
StatusOr MakeConvolveHlo( HloInstruction* lhs, HloInstruction* rhs, int64 feature_group_count, - const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 batch_group_count, const Window& window, + const ConvolutionDimensionNumbers& dimension_numbers, const PrecisionConfig& precision_config); // Creates a transpose HLO instruction and adds it to the computation containing diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index a46d20d5808..72899ffe163 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/algorithm/container.h" @@ -32,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -42,7 +44,45 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" namespace xla { +namespace { +// CalculatePostOrderSchedule traverses a module and assigns an ordinal to each +// instruction based on the postorder dependency. +int64 CalculatePostOrderScheduleHelper( + const HloComputation* comp, int64 start_ordinal, + absl::flat_hash_map* ordinal_map) { + int64 ordinal = start_ordinal; + for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kCall || + instruction->opcode() == HloOpcode::kConditional) { + for (const HloComputation* called_computation : + instruction->called_computations()) { + ordinal = CalculatePostOrderScheduleHelper(called_computation, ordinal, + ordinal_map); + } + } + if (instruction->opcode() == HloOpcode::kWhile) { + ordinal = CalculatePostOrderScheduleHelper(instruction->while_condition(), + ordinal, ordinal_map); + ordinal = CalculatePostOrderScheduleHelper(instruction->while_body(), + ordinal, ordinal_map); + } + // It's possible that in some unit tests the computation graph is not + // flattened (meaning we could have multiple callers for one computation). In + // that case the ordinal_map will see the instruction multiple times. We + // consider that case to be ok as it only shows up in unit tests. + ordinal_map->insert({instruction, ordinal++}); + } + return ordinal; +} +absl::flat_hash_map CalculatePostOrderSchedule( + const HloModule& module) { + absl::flat_hash_map map; + CalculatePostOrderScheduleHelper(module.entry_computation(), 0, &map); + return map; +} + +} // namespace using absl::StrAppend; using absl::StrCat; @@ -757,27 +797,35 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( } void HloDataflowAnalysis::Propagate() { - std::queue worklist; + using Work = std::pair; + // Avoid duplicating work by preferring work items early in the post order + // schedule. Intuitively, we start from entry parameters and propagate buffer + // updates throughout the module only once.
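The worklist declared next keeps pending instructions in a min-heap keyed by the post-order ordinal computed above, so value-set updates flow from parameters toward the root and instructions are rarely revisited. A standalone sketch of that pattern, with a hypothetical helper name and the usual <queue> and absl container includes assumed, looks like this:

void VisitInPostOrderOrdinalOrder(
    const absl::flat_hash_map<HloInstruction*, int64>& ordinal_map,
    const std::vector<HloInstruction*>& initial) {
  using Work = std::pair<int64, HloInstruction*>;  // (ordinal, instruction)
  // std::greater turns the priority_queue into a min-heap, so the pending
  // instruction with the smallest post-order ordinal is popped first.
  std::priority_queue<Work, std::vector<Work>, std::greater<Work>> worklist;
  absl::flat_hash_set<HloInstruction*> workset;
  auto add_to_worklist = [&](HloInstruction* instruction) {
    // The workset keeps each pending instruction queued at most once.
    if (workset.insert(instruction).second) {
      worklist.emplace(ordinal_map.at(instruction), instruction);
    }
  };
  for (HloInstruction* instruction : initial) {
    add_to_worklist(instruction);
  }
  while (!worklist.empty()) {
    HloInstruction* instruction = worklist.top().second;
    worklist.pop();
    workset.erase(instruction);
    // Process `instruction` here; if its outputs changed, re-queue its users
    // with add_to_worklist(user).
  }
}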
+ std::priority_queue, std::greater> worklist; absl::flat_hash_set workset; - auto add_to_worklist = [&worklist, &workset](HloInstruction* instruction) { + auto priority_map = CalculatePostOrderSchedule(module_); + auto add_to_worklist = [&priority_map, &worklist, + &workset](HloInstruction* instruction) { if (workset.insert(instruction).second) { - worklist.push(instruction); + worklist.emplace(priority_map[instruction], instruction); } }; - for (HloComputation* computation : module_.computations()) { - for (HloInstruction* instruction : computation->instructions()) { + auto comps = module_.MakeComputationPostOrder(); + for (HloComputation* computation : comps) { + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { add_to_worklist(instruction); } } VLOG(1) << "SSA_FORM_: " << ssa_form_; while (!worklist.empty()) { - HloInstruction* instruction = worklist.front(); + HloInstruction* instruction = worklist.top().second; auto add_to_worklist = [&](HloInstruction* todo) { if (workset.insert(todo).second) { VLOG(1) << " Adding todo : " << todo->name(); - worklist.push(todo); + worklist.emplace(priority_map[todo], todo); } }; worklist.pop(); @@ -1130,69 +1178,49 @@ bool HloDataflowAnalysis::DoesNotUseOperandBuffer( return true; } -// Given a fusion whose root is a dynamic-update-slice op, determines whether -// the fusion's output buffer can be shared with the buffer of fusion_param, -// which must be a fused parameter of the fusion. -// -// Preconditions: -// -// - fusion's root is a dynamic-update-slice op. -// - fusion_param is a parameter within the fusion. -// -// fusion_param may point to a subelement of the actual parameter instruction if -// the param is a tuple; i.e. fusion_param->index() need not be the empty list. -// -// Returns true if: -// -// * fusion_param is used by the root of dynamic-update-slice as the "base" of -// the update, i.e. the thing being updated, AND -// * all other uses of fusion_param are dynamic-slices that slice the same -// indices as are overwritten in the dynamic-update-slice. -// -// In the case that there are no other uses of fusion_param (last bullet point -// is vacuously true) it's easy to see why an in-place DUS is safe; this is just -// the "natural" implementation of DUS. If there are other users, in-place DUS -// is safe on the assumption that the thread which writes element i of the -// output will be the only one to read element i of fusion_param (via the -// dynamic-slice ops). -static bool CanDoInPlaceDynamicUpdateSlice(HloInstruction* fusion, - const HloValue& fusion_param_value) { - auto* root = - Cast(fusion->fused_expression_root()); - auto* fusion_param = fusion_param_value.instruction(); - CHECK_EQ(fusion_param->opcode(), HloOpcode::kParameter); - CHECK_EQ(fusion_param->parent(), fusion->fused_instructions_computation()); +/*static*/ bool HloDataflowAnalysis::IsInPlaceOperation(HloOpcode opcode) { + return opcode == HloOpcode::kDynamicUpdateSlice || + opcode == HloOpcode::kScatter; +} - // fusion_param must be used by the root as the "base" of the - // dynamic-update-slice. The natural way to check this would be - // - // `if (root->operand(0) != fusion_param)` - // - // but we also have to handle the case where the fusion parameter is - // tuple-shaped and we're considering just one element of that tuple, i.e. - // fusion_param.index() != {}. 
- if (absl::c_count_if(fusion_param_value.uses(), [&](const HloUse& use) { - return use.instruction == root; - }) != 1) { - return false; +/*static*/ std::vector> +HloDataflowAnalysis::GetInPlaceInputOutputPairs(HloInstruction* instruction) { + if (IsInPlaceOperation(instruction->opcode())) { + return {{HloUse{instruction, 0, {}}, {}}}; + } else if (instruction->opcode() != HloOpcode::kFusion) { + return {}; } - - // All other uses of fusion_param must be dynamic-slices that slice the same - // indices as are overwritten by the dynamic-update-slice. - for (const HloUse& use : fusion_param_value.uses()) { - auto* user = use.instruction; - if (user == root) { - continue; + std::vector> input_output_pairs; + for (auto& indexed_shape : ShapeUtil::GetLeafShapes(instruction->shape())) { + const HloInstruction* hlo_generating_output = + instruction->fused_expression_root(); + for (int64 i = 0; i < indexed_shape.index.size(); ++i) { + if (hlo_generating_output->opcode() == HloOpcode::kTuple) { + hlo_generating_output = + hlo_generating_output->operand(indexed_shape.index[i]); + } else { + CHECK_EQ(i, indexed_shape.index.size() - 1); + } } - // Check that `user` is a dynamic-slice op and has the same slice indices as - // `root`. - auto* ds = DynCast(user); - if (!ds || ds->index_operands() != root->index_operands()) { - return false; + if (IsInPlaceOperation(hlo_generating_output->opcode())) { + ShapeIndex operand_index; + const HloInstruction* fusion_parameter = + hlo_generating_output->operand(0); + while (fusion_parameter->opcode() == HloOpcode::kGetTupleElement) { + operand_index.push_front(fusion_parameter->tuple_index()); + fusion_parameter = fusion_parameter->operand(0); + } + + if (fusion_parameter->opcode() == HloOpcode::kParameter) { + input_output_pairs.emplace_back( + HloUse{instruction, fusion_parameter->parameter_number(), + operand_index}, + indexed_shape.index); + } } } - return true; + return input_output_pairs; } bool HloDataflowAnalysis::CanShareOperandBufferWithUser( @@ -1213,24 +1241,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( return false; } - if (user->opcode() == HloOpcode::kFusion) { - // Get the parameter associated with 'operand'; - HloInstruction* fusion_param = - user->fused_parameter(user->operand_index(operand)); - - const HloValue& fusion_param_value = - GetValueDefinedAt(fusion_param, operand_index); - - // TODO(b/80315712): This code is in a bit of a weird intermediate state - // at the moment. The in-place DUS check really needs to be common to all - // backends, so it runs first. Then we run the backend-specific check if - // provided, or go through the target-independent check if not. - // Unfortunately, the notionally "target-independent" path actually contains - // some target-specific code, so we can't run all of it *in addition* to the - // target-specific function, like the interface documentation says. - if (user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - return CanDoInPlaceDynamicUpdateSlice(user, fusion_param_value); + // Must-alias relationship returns true for in-place operations (DUS and DUS + // fusions), regardless of the backend. 
+ for (const auto& operand_and_output_index : + GetInPlaceInputOutputPairs(user)) { + if (operand_and_output_index.second != user_index) { + continue; + } + for (const HloUse& use : GetUniqueValueAt(operand, operand_index).uses()) { + if (use == operand_and_output_index.first) { + return true; + } } } diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index bec592aeb20..ffa307d71dd 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -49,6 +49,9 @@ class HloDataflowAnalysis { // Infrastructure for passing may-alias hints: HLO passes can populate the // may-alias table. If an empty optional is returned, default rules are used. // + // Must-alias rules (as defined by GetInPlaceInputOutputPairs) cannot be + // overriden using backend-specific overrides. + // // The first parameter of the function should be the instruction, the // second parameter should be an operand of the instruction. The third // parameter should be the output index of the instruction. @@ -160,6 +163,15 @@ class HloDataflowAnalysis { const HloModule& module() const { return module_; } + // Returns true if the operation is an in-place operation and its operand 0 + // must alias with the output. + static bool IsInPlaceOperation(HloOpcode opcode); + + // Returns a vector consisting of the HloUse (operand number and shape index) + // and output shape index of the in-place operations within this HLO. + static std::vector> GetInPlaceInputOutputPairs( + HloInstruction* instruction); + protected: HloDataflowAnalysis(const HloModule& module, bool ssa_form, bool bitcast_defines_value = false, diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 1bbbb248bbc..1fa6fe95c40 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1229,10 +1229,10 @@ TEST_P(HloDataflowAnalysisTest, CopyStartAndCopyDone) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto copy_start = builder.AddInstruction(HloInstruction::CreateUnary( + auto copy_start = builder.AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({constant->shape(), constant->shape(), ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, constant)); + constant)); auto copy_done = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kCopyDone, copy_start)); module_->AddEntryComputation(builder.Build()); @@ -2324,36 +2324,6 @@ TEST_F(CanShareOperandBufferWithUserTest, dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {})); } -TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithDifferentIndices) { - const char* kModule = R"( - HloModule test - - fused_computation { - p0 = f32[10,20,30] parameter(0) - p1 = s32[] parameter(1) - p2 = s32[] parameter(2) - p3 = s32[] parameter(3) - slice = f32[1,1,30] dynamic-slice(p0, p1, p2, p3), dynamic_slice_sizes={1,1,30} - ROOT dus = f32[10,20,30] dynamic-update-slice(p0, slice, p1, p3, p2) - } - - ENTRY test { - p0 = f32[10,20,30] parameter(0) - p1 = s32[] parameter(1) - p2 = s32[] parameter(2) - p3 = s32[] parameter(3) - ROOT fusion = f32[10,20,30] fusion(p0, p1, p2, p3), kind=kLoop, calls=fused_computation - } - )"; - 
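// ---------------------------------------------------------------------------
// Illustrative sketch (editorial; assumes the XLA build environment): building
// a copy-start with the dedicated factory used in the updated test above. The
// result shape is a (destination, source, context) tuple; `operand` is a
// hypothetical HloInstruction* of shape f32[].
Shape f32_scalar = ShapeUtil::MakeShape(F32, {});
std::unique_ptr<HloInstruction> copy_start = HloInstruction::CreateCopyStart(
    ShapeUtil::MakeTupleShape(
        {f32_scalar, f32_scalar, ShapeUtil::MakeShape(U32, {})}),
    operand, /*is_cross_program_prefetch=*/true);
// ---------------------------------------------------------------------------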
TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(kModule)); - auto* fusion = module_->entry_computation()->root_instruction(); - auto* param = module_->entry_computation()->parameter_instruction(0); - - RunAnalysis(); - EXPECT_FALSE( - dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {})); -} - TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithSameIndices) { const char* kModule = R"( HloModule test diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 66e9e01fc38..acccf7aac9a 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -1573,9 +1573,9 @@ class OutputBatchIndexToInputIndex { int64 index_vector_dim = dim_numbers_.index_vector_dim(); for (int64 i = 0, e = index_vector_.size(); i < e; i++) { index_vector_index_[index_vector_dim] = i; - // TODO(george): OK what should happen here? - // seems OK to crash though. - index_vector_[i] = *start_indices_.GetIntegralAsS64(index_vector_index_); + auto start_index = start_indices_.GetIntegralAsS64(index_vector_index_); + TF_RET_CHECK(start_index.has_value()); + index_vector_[i] = *start_index; } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index 9226cd556ff..b91ec9d86ee 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -48,22 +48,26 @@ template struct is_complex_t : absl::disjunction, std::is_same> {}; +namespace detail { +template +using unsigned_promoted_type_t = + std::make_unsigned_t() + std::declval())>; +} + // ToArithmeticSafeType(T t): -// - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed +// - converts `t` to an unsigned integer at least as wide as `int` if T is an // integer, and // - otherwise returns `t` unchanged. // // It's UB in C++ to under/overflow a signed integer, so we wrap all arithmetic // in this type to force 2's complement behavior. template ::value && - std::is_signed::value>::type* = nullptr> -typename std::make_unsigned::type ToArithmeticSafeType(T t) { - return static_cast::type>(t); + typename std::enable_if::value>::type* = nullptr> +detail::unsigned_promoted_type_t ToArithmeticSafeType(T t) { + return static_cast>(t); } template ::value || - !std::is_signed::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> T ToArithmeticSafeType(T t) { return std::move(t); } @@ -1153,7 +1157,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const int64 feature_group_index = out_index[output_z_dim] / output_feature_group_size; - const int64 batch_group_index = out_index[output_z_dim]; + const int64 depthwise_multiplier = + batch_group_count > 1 ? 
output_z_size / input_batch_size : 1; + const int64 batch_group_index = + out_index[output_z_dim] / depthwise_multiplier; ElementwiseT result_val = static_cast(0); DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(), @@ -1214,7 +1221,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { feature_group_index * input_feature_group_size + rhs_iz; int64 lhs_linear_index = lhs_linear_spatial_index; - lhs_linear_index += out_index[output_batch_dim] * lhs_dim_multipliers[input_batch_dim]; @@ -1229,7 +1235,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { lhs_dim_multipliers[input_batch_dim]; lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim]; - int64 rhs_linear_index = rhs_linear_spatial_index; rhs_linear_index += out_index[output_z_dim] * diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc deleted file mode 100644 index 9415e20af7b..00000000000 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" - -#include "absl/algorithm/container.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" - -namespace xla { - -namespace { - -StatusOr ReplaceGetSize( - HloInstruction* instr, - DynamicDimensionInference* dynamic_dimension_inference) { - if (instr->opcode() != HloOpcode::kGetDimensionSize) { - return false; - } - HloComputation* computation = instr->parent(); - - TF_ASSIGN_OR_RETURN(auto legal_shape, - ShapeInference::InferGetDimensionSizeShape( - instr->operand(0)->shape(), instr->dimension())); - TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)) - << "instr->shape() " << instr->shape().ToString() << " , " - << "legal_shape " << legal_shape.ToString(); - TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), S32)); - HloInstruction* operand = instr->mutable_operand(0); - int64 dim = instr->dimension(); - HloInstruction* dynamic_size = - dynamic_dimension_inference->GetDynamicSize(operand, {}, dim); - if (dynamic_size != nullptr) { - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size)); - // The dependency between a instruction and its dynamic dimensions is not - // modeled in the IR. As instr is being replaced by dynamic_size, also tell - // dynamic dimension inference that the instruction is being replaced. 
- dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith( - instr, dynamic_size); - } else { - int32 size = instr->operand(0)->shape().dimensions(dim); - HloInstruction* new_instr = computation->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); - dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith(instr, - new_instr); - } - return true; -} - -StatusOr ReplaceSetSize(HloInstruction* instr) { - if (instr->opcode() != HloOpcode::kSetDimensionSize) { - return false; - } - - TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( - instr->shape(), instr->operand(0)->shape())) - << "instr->shape() " << instr->shape().ToString() << " , " - << "instruction operand shape " << instr->operand(0)->shape(); - HloInstruction* operand = instr->mutable_operand(0); - - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); - return true; -} - -} // namespace - -StatusOr HloGetDimensionSizeRewriter::Run(HloModule* module) { - bool changed = false; - HloProto proto; - TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference, - DynamicDimensionInference::Run(module)); - *proto.mutable_hlo_module() = module->ToProto(); - // It's important to replace get-dimension-size first before - // set-dimension-size for the case below: - // static_op dynamic_size - // | | - // set-dimension-size // Marks the dimension as dynamic - // | - // get-dimension-size - // - // If we replace set dimension size first, we'd have - // - // static_op - // | - // get-dimension-size - // - // This will get static size of the op, which is incorrect. - for (auto* computation : module->computations()) { - for (auto instruction : computation->MakeInstructionPostOrder()) { - TF_ASSIGN_OR_RETURN(bool replaced_get_size, - ReplaceGetSize(instruction, &inference)); - changed = changed || replaced_get_size; - } - } - for (auto* computation : module->computations()) { - for (auto instruction : computation->MakeInstructionPostOrder()) { - TF_ASSIGN_OR_RETURN(bool replaced_set_size, ReplaceSetSize(instruction)); - changed = changed || replaced_set_size; - } - } - return changed; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc deleted file mode 100644 index b1491e96095..00000000000 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" - -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/hlo_parser.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tests/literal_test_util.h" -#include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace { - -namespace op = xla::testing::opcode_matchers; - -class HloGetDimensionSizeRewriterTest : public HloTestBase { - protected: - HloGetDimensionSizeRewriterTest() {} -}; - -TEST_F(HloGetDimensionSizeRewriterTest, Ok) { - auto module = ParseAndReturnVerifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3,4] parameter(0) - size0 = s32[] get-dimension-size(p), dimensions={0} - size1 = s32[] get-dimension-size(p), dimensions={1} - ROOT mul = s32[] multiply(size0, size1) -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); - EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Multiply(op::Constant(), op::Constant())); -} - -TEST_F(HloGetDimensionSizeRewriterTest, GetSetSetDimensionSizeRewriter) { - auto module = ParseAndReturnVerifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3,4] parameter(0) - size0 = s32[] get-dimension-size(p), dimensions={0} - p_copy = s32[3,4] copy(p) - p_copy_dynamic = s32[<=3, 4] set-dimension-size(p_copy, size0), dimensions={0} - size1 = s32[] get-dimension-size(p_copy_dynamic), dimensions={0} - ROOT mul = s32[] multiply(size0, size1) -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); - EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Multiply(op::Constant(), op::Constant())); -} - -TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) { - auto module = ParseAndReturnUnverifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3]{0} parameter(0) - ROOT gds = s64[] get-dimension-size(p), dimensions={0} -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_FALSE(pass.Run(module.get()).ok()); -} - -TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) { - auto module = ParseAndReturnUnverifiedModule(R"( -HloModule _ -ENTRY gds { - p = f32[2,5] parameter(0) - ROOT gds = s32[] get-dimension-size(p), dimensions={2} -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_FALSE(pass.Run(module.get()).ok()); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index d7e8984dee8..164e92ae8e8 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1012,6 +1012,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGather: case HloOpcode::kPad: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReverse: case 
HloOpcode::kTupleSelect: case HloOpcode::kTranspose: diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc index e123161720b..34bc30d641f 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_module.h" namespace xla { @@ -24,9 +25,10 @@ bool HloInputOutputAliasConfig::OutputHasAlias( return alias_.element(output_index).has_value(); } -Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index, - int64 param_number, - const ShapeIndex& param_index) { +Status HloInputOutputAliasConfig::SetUpAlias( + const ShapeIndex& output_index, int64 param_number, + const ShapeIndex& param_index, + HloInputOutputAliasConfig::AliasKind must_alias) { TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index)) << "Trying to set up alias at " << output_index.ToString() << " which is an invalid index for shape " @@ -41,7 +43,8 @@ Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index, param_number, param_index.ToString(), output_index.ToString(), alias_.element(output_index)->parameter_number, alias_.element(output_index)->parameter_index.ToString()); - (*alias_.mutable_element(output_index)) = Alias(param_number, param_index); + (*alias_.mutable_element(output_index)) = + Alias(param_number, param_index, must_alias); VLOG(4) << "Set up alias between output index " << output_index.ToString() << " and parameter " << param_index << " at index " << param_index.ToString(); @@ -61,6 +64,11 @@ HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const { for (int64 i : data->parameter_index) { entry.add_parameter_shape_index(i); } + if (data->must_alias()) { + entry.set_kind(Kind::MUST_ALIAS); + } else { + entry.set_kind(Kind::MAY_ALIAS); + } result.add_entries()->Swap(&entry); } }); @@ -77,8 +85,9 @@ StatusOr HloInputOutputAliasConfig::CreateFromProto( int64 param_number = entry.parameter_number(); ShapeIndex param_index(entry.parameter_shape_index().begin(), entry.parameter_shape_index().end()); + AliasKind kind = entry.kind() == Kind::MAY_ALIAS ? kMayAlias : kMustAlias; TF_RETURN_IF_ERROR( - result.SetUpAlias(output_index, param_number, param_index)); + result.SetUpAlias(output_index, param_number, param_index, kind)); } return result; } @@ -93,9 +102,9 @@ string HloInputOutputAliasConfig::ToString() const { ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) { pieces.push_back(absl::StrFormat( - " OutputIndex %s is aliased with parameter %lld at %s:", - output_index.ToString(), alias.parameter_number, - alias.parameter_index.ToString())); + " OutputIndex %s is %saliased with parameter %lld at %s:", + output_index.ToString(), alias.kind == kMustAlias ? 
"must-" : "may-", + alias.parameter_number, alias.parameter_index.ToString())); }); return absl::StrJoin(pieces, "\n"); } @@ -112,6 +121,19 @@ string HloInputOutputAliasConfig::ToShortString() const { return absl::StrJoin(pieces, ", "); } +bool HloInputOutputAliasConfig::ParameterMustAlias( + int64 param_number, const ShapeIndex& param_index) const { + bool result = false; + alias_.ForEachElement( + [&](const xla::ShapeIndex&, absl::optional alias) { + if (alias && alias->parameter_number == param_number && + alias->parameter_index == param_index && alias->must_alias()) { + result = true; + } + }); + return result; +} + absl::optional HloInputOutputAliasConfig::GetAliasedOutput( int64 param_number, const ShapeIndex& param_index) const { absl::optional output; diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h index d5ca28e9387..d5630467783 100644 --- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h +++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h @@ -32,22 +32,32 @@ class HloModule; // parameter index in the entry computation. class HloInputOutputAliasConfig { public: + // The kind of aliases which can be set. A kMayAlias is one setup at + // compilation time by the user, and has to be respected. A kMustAlias one + // might be setup by the compiler, if it decides it is convenient to do so. + enum AliasKind { + kMayAlias, + kMustAlias, + }; // Defines the alias information for a given output buffer. A given output // buffer shape index can refer only to one parameter+index. struct Alias { - Alias(int64 parameter_number, ShapeIndex parameter_index) + Alias(int64 parameter_number, ShapeIndex parameter_index, + AliasKind kind = kMayAlias) : parameter_number(parameter_number), - parameter_index(std::move(parameter_index)) {} + parameter_index(std::move(parameter_index)), + kind(kind) {} int64 parameter_number; ShapeIndex parameter_index; + AliasKind kind; + + bool must_alias() const { return kind == kMustAlias; } std::string ToString() { - if (parameter_index.empty()) { - return absl::StrCat(parameter_number); - } - return absl::StrFormat("(%lld, %s)", parameter_number, - parameter_index.ToString()); + return absl::StrFormat("(%lld, %s, %s)", parameter_number, + parameter_index.ToString(), + kind == kMustAlias ? "must-alias" : "may-alias"); } }; @@ -61,7 +71,8 @@ class HloInputOutputAliasConfig { // Sets up alias config from `output_index` to `param_index` at // `param_number`. Status SetUpAlias(const ShapeIndex& output_index, int64 param_number, - const ShapeIndex& param_index); + const ShapeIndex& param_index, + AliasKind must_alias = kMayAlias); // Returns true if the given parameter is aliased with one of the output // buffers. @@ -92,6 +103,11 @@ class HloInputOutputAliasConfig { absl::optional GetAliasedParameter( const ShapeIndex& output_index) const; + // Returns if the parameter at the given parameter number and parameter + // index must-alias with an output. 
+ bool ParameterMustAlias(int64 param_number, + const ShapeIndex& param_index) const; + using AliasFn = std::function; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 94d53ebe0b1..251261a677f 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -167,6 +167,11 @@ StatusOr> HloInstruction::CreateFromProto( absl::Span(fft_length)); break; } + case HloOpcode::kCopyStart: { + instruction = CreateCopyStart(shape, operands(0), + proto.is_cross_program_prefetch()); + break; + } case HloOpcode::kCompare: { // Auto-upgraded from deprecated opcode skips the following. if (!comparison_direction) { @@ -174,8 +179,19 @@ StatusOr> HloInstruction::CreateFromProto( comparison_direction, StringToComparisonDirection(proto.comparison_direction())); } - instruction = - CreateCompare(shape, operands(0), operands(1), *comparison_direction); + auto comparison_type_str = proto.comparison_type(); + if (!comparison_type_str.empty()) { + // If a comparison type is specified, it *must* be valid. + TF_ASSIGN_OR_RETURN(auto comparison_type, + StringToComparisonType(comparison_type_str)); + instruction = CreateCompare(shape, operands(0), operands(1), + *comparison_direction, comparison_type); + } else { + // Allow the specify of comparison type to be optional. + // The comparison type will be determined by the types of the operands. + instruction = CreateCompare(shape, operands(0), operands(1), + *comparison_direction); + } break; } case HloOpcode::kTriangularSolve: { @@ -689,6 +705,17 @@ StatusOr> HloInstruction::CreateFromProto( instruction = CreateReshape(shape, operands(0), inferred_dimension); break; } + case HloOpcode::kDynamicReshape: { + TF_RET_CHECK(shape.IsArray() && operands(0)->shape().IsArray() && + ShapeUtil::ElementsIn(shape) == + ShapeUtil::ElementsIn(operands(0)->shape())) + << "shape: " << ShapeUtil::HumanString(shape) + << " operand: " << ShapeUtil::HumanString(operands(0)->shape()); + const auto& operand_vector = all_operands(); + instruction = CreateDynamicReshape( + shape, operands(0), absl::MakeSpan(operand_vector).subspan(1)); + break; + } default: { instruction = absl::WrapUnique(new HloInstruction(opcode, shape)); for (const int64 operand_id : proto.operand_ids()) { @@ -817,7 +844,6 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, case HloOpcode::kCeil: case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: - case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: case HloOpcode::kCos: case HloOpcode::kClz: @@ -924,10 +950,18 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, fft_length); } +/* static */ std::unique_ptr HloInstruction::CreateCopyStart( + const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch) { + return absl::make_unique(shape, operand, + is_cross_program_prefetch); +} + /* static */ std::unique_ptr HloInstruction::CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - ComparisonDirection direction) { - return absl::make_unique(shape, lhs, rhs, direction); + ComparisonDirection direction, absl::optional type) { + return absl::make_unique(shape, lhs, rhs, direction, + type); } /* static */ std::unique_ptr @@ -1361,6 +1395,19 @@ HloInstruction::CreateBroadcastSequence( inferred_dimension); } +/* static */ std::unique_ptr +HloInstruction::CreateDynamicReshape( + const Shape& shape, HloInstruction* data_operand, + 
absl::Span dim_sizes) { + CHECK_EQ(ShapeUtil::ElementsIn(shape), + ShapeUtil::ElementsIn(data_operand[0].shape())) + << "shape: " << ShapeUtil::HumanString(shape) + << " operand: " << ShapeUtil::HumanString(data_operand[0].shape()); + CHECK_EQ(shape.rank(), dim_sizes.size()); + return absl::make_unique(shape, data_operand, + dim_sizes); +} + /* static */ std::unique_ptr HloInstruction::CreateTranspose( const Shape& shape, HloInstruction* operand, absl::Span dimensions) { @@ -1557,6 +1604,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kTranspose: case HloOpcode::kBroadcast: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kMap: case HloOpcode::kSlice: case HloOpcode::kConstant: @@ -1894,6 +1942,56 @@ Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) { return Status::OK(); } +bool HloInstruction::IdenticalInternal( + const HloInstruction& other, + const std::function& + eq_operands, + const std::function& + eq_computations, + bool layout_sensitive, bool ignore_channel_id_values) const { + // An instruction is always identical to itself. + if (this == &other) { + return true; + } + + // Identical instruction must have the same opcode, shape, and identical + // operands. + if (opcode() != other.opcode()) { + return false; + } + if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape()) + : ShapeUtil::Compatible(shape(), other.shape()))) { + return false; + } + if (operands().size() != other.operands().size()) { + return false; + } + + // Two AllReduces are Identical if they have the same channel_id. + // Their operands don't have to be Identical. + if (!IsCrossModuleAllReduce()) { + // Use an explicit loop rather than ContainerEquals, because copying + // around std::functions may be too expensive in some cases. + for (size_t i = 0; i < operands().size(); ++i) { + if (!eq_operands(operand(i), other.operand(i))) { + return false; + } + } + } + + if (backend_config_ != other.backend_config_) { + return false; + } + + if (ignore_channel_id_values) { + if (auto channel_inst = DynCast(this)) { + return channel_inst->IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations); + } + } + return IdenticalSlowPath(other, eq_computations); +} + void HloInstruction::AppendOperand(HloInstruction* operand) { if (operand->parent() != nullptr) { DCHECK(!operand->parent()->IsMarkedAsDead(operand)) @@ -1995,6 +2093,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kReal: case HloOpcode::kRemainder: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReplicaId: case HloOpcode::kRoundNearestAfz: case HloOpcode::kRsqrt: @@ -2800,7 +2899,8 @@ HloInstructionProto HloInstruction::ToProto() const { string HloInstruction::ToCategory() const { if (opcode() == HloOpcode::kTranspose || opcode() == HloOpcode::kCopy || - opcode() == HloOpcode::kReshape) { + opcode() == HloOpcode::kReshape || + opcode() == HloOpcode::kDynamicReshape) { return "data formatting"; } @@ -3021,6 +3121,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandlePad(this); case HloOpcode::kReshape: return visitor->HandleReshape(this); + case HloOpcode::kDynamicReshape: + return visitor->HandleDynamicReshape(this); case HloOpcode::kTranspose: return visitor->HandleTranspose(this); case HloOpcode::kReverse: @@ -3318,6 +3420,11 @@ class HloInstruction::FusionReusesParamElements { // that. value_it = cache->find(&hlo); value_it->second = new_val; + // Fold() minimizes the UseKind value. 
If it is already minimum, we can + // break the loop early. + if (new_val == UseKind::kReuse) { + break; + } } } return value_it->second; @@ -3939,6 +4046,10 @@ const Shape& HloInstruction::outfeed_shape() const { return Cast(this)->outfeed_shape(); } +Shape* HloInstruction::mutable_outfeed_shape() { + return Cast(this)->mutable_outfeed_shape(); +} + const string& HloInstruction::outfeed_config() const { return Cast(this)->outfeed_config(); } @@ -4077,6 +4188,10 @@ const DomainMetadata& HloInstruction::user_side_metadata() const { return Cast(this)->user_side_metadata(); } +bool HloInstruction::is_cross_program_prefetch() const { + return Cast(this)->is_cross_program_prefetch(); +} + ComparisonDirection HloInstruction::comparison_direction() const { return Cast(this)->direction(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index e29323c25b4..e21ae719e4d 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -592,10 +592,17 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, FftType fft_type, absl::Span fft_length); + // Creates a copy-start op, indicating whether this is a cross-program + // prefetch or not. + static std::unique_ptr CreateCopyStart( + const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch = false); + // Creates a compare op, performing the comparison specified in direction. static std::unique_ptr CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - Comparison::Direction direction); + Comparison::Direction direction, + absl::optional type = absl::nullopt); static std::unique_ptr CreateTriangularSolve( const Shape& shape, HloInstruction* a, HloInstruction* b, @@ -878,6 +885,14 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, int64 inferred_dimension = -1); + // Creates a dynamic reshape instruction. Similar to reshape but dynamic + // dimensions sizes are provided as additional variadic arguments. + // + // Precondition: dim_sizes.size() == shape.rank() + static std::unique_ptr CreateDynamicReshape( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes); + // Creates a transpose instruction which permutes the operand dimensions. static std::unique_ptr CreateTranspose( const Shape& shape, HloInstruction* operand, @@ -1107,41 +1122,23 @@ class HloInstruction { const std::function& eq_computations = std::equal_to(), bool layout_sensitive = true) const { - // An instruction is always identical to itself. - if (this == &other) { - return true; - } + return IdenticalInternal(other, eq_operands, eq_computations, + layout_sensitive, + /*ignore_channel_id_values=*/false); + } - // Identical instruction must have the same opcode, shape, and identical - // operands. - if (opcode() != other.opcode()) { - return false; - } - if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape()) - : ShapeUtil::Compatible(shape(), other.shape()))) { - return false; - } - if (operands().size() != other.operands().size()) { - return false; - } - - // Two AllReduces are Identical if they have the same channel_id. - // Their operands don't have to be Identical. - if (!IsCrossModuleAllReduce()) { - // Use an explicit loop rather than ContainerEquals, because copying - // around std::functions may be too expensive in some cases. 
- for (size_t i = 0; i < operands().size(); ++i) { - if (!eq_operands(operand(i), other.operand(i))) { - return false; - } - } - } - - if (backend_config_ != other.backend_config_) { - return false; - } - - return IdenticalSlowPath(other, eq_computations); + // Same as Identical() but ignores channel ID value mismatches, as long as + // both have channel IDs or neither has a channel ID. + bool IdenticalIgnoringChannelIdValues( + const HloInstruction& other, + const std::function& + eq_operands = std::equal_to(), + const std::function& + eq_computations = std::equal_to(), + bool layout_sensitive = true) const { + return IdenticalInternal(other, eq_operands, eq_computations, + layout_sensitive, + /*ignore_channel_id_values=*/true); } // Generates a hash value of an HLO instruction. Hash considers @@ -1772,6 +1769,9 @@ class HloInstruction { // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const; + // Returns the mutable shape for the Outfeed instruction. + Shape* mutable_outfeed_shape(); + // Delegates to HloCollectiveInstruction::replica_groups. const std::vector& replica_groups() const; @@ -1856,6 +1856,9 @@ class HloInstruction { // Delegates to HloDomainInstruction::user_side_metadata(). const DomainMetadata& user_side_metadata() const; + // Delegates to HloCopyStartInstruction::is_cross_program_prefetch(). + bool is_cross_program_prefetch() const; + // Delegates to HloCompareInstruction::direction(). ComparisonDirection comparison_direction() const; @@ -1944,6 +1947,14 @@ class HloInstruction { private: friend class HloComputation; + bool IdenticalInternal( + const HloInstruction& other, + const std::function& + eq_operands, + const std::function& + eq_computations, + bool layout_sensitive, bool ignore_channel_id_values) const; + // Implementation for non-common logic of CloneWithNewOperands. 
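// ---------------------------------------------------------------------------
// Illustrative sketch (editorial; assumes the XLA build environment): both
// entry points above funnel into IdenticalInternal(); the only difference is
// whether channel-id *values* participate in the comparison. `ar1` and `ar2`
// are hypothetical all-reduce instructions that are identical except for their
// channel ids (both have one).
CHECK(!ar1->Identical(*ar2));                        // channel ids differ
CHECK(ar1->IdenticalIgnoringChannelIdValues(*ar2));  // values ignored
// ---------------------------------------------------------------------------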
virtual std::unique_ptr CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 3d34fa03a80..c4c31dba9a4 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -204,12 +204,54 @@ std::unique_ptr HloFftInstruction::CloneWithNewOperandsImpl( fft_length_); } -HloCompareInstruction::HloCompareInstruction(const Shape& shape, - HloInstruction* lhs, - HloInstruction* rhs, - ComparisonDirection direction) +HloCopyStartInstruction::HloCopyStartInstruction(const Shape& shape, + HloInstruction* operand, + bool is_cross_program_prefetch) + : HloInstruction(HloOpcode::kCopyStart, shape), + is_cross_program_prefetch_(is_cross_program_prefetch) { + AppendOperand(operand); +} + +HloInstructionProto HloCopyStartInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_is_cross_program_prefetch(is_cross_program_prefetch_); + return proto; +} + +std::vector HloCopyStartInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result; + if (is_cross_program_prefetch()) { + result.push_back("is_cross_program_prefetch=true"); + } + return result; +} + +bool HloCopyStartInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return is_cross_program_prefetch() == + casted_other.is_cross_program_prefetch(); +} + +std::unique_ptr +HloCopyStartInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], is_cross_program_prefetch()); +} + +HloCompareInstruction::HloCompareInstruction( + const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, + ComparisonDirection direction, absl::optional type) : HloInstruction(HloOpcode::kCompare, shape), - compare_(direction, lhs->shape().element_type()) { + compare_(direction, type ? 
(*type) + : Comparison::DefaultComparisonType( + lhs->shape().element_type())) { AppendOperand(lhs); AppendOperand(rhs); } @@ -218,12 +260,21 @@ HloInstructionProto HloCompareInstruction::ToProto() const { HloInstructionProto proto = HloInstruction::ToProto(); proto.set_comparison_direction( ComparisonDirectionToString(compare_.GetDirection())); + proto.set_comparison_type(ComparisonTypeToString(compare_.GetType())); return proto; } std::vector HloCompareInstruction::ExtraAttributesToStringImpl( const HloPrintOptions& options) const { - return {StrCat("direction=", ComparisonDirectionToString(direction()))}; + std::vector result; + result.push_back( + StrCat("direction=", ComparisonDirectionToString(direction()))); + if (compare_.GetType() != + Comparison::DefaultComparisonType(operand(0)->shape().element_type())) { + result.push_back( + StrCat("type=", ComparisonTypeToString(compare_.GetType()))); + } + return result; } bool HloCompareInstruction::IdenticalSlowPath( @@ -238,8 +289,8 @@ std::unique_ptr HloCompareInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { CHECK_EQ(new_operands.size(), 2); - return absl::make_unique(shape, new_operands[0], - new_operands[1], direction()); + return absl::make_unique( + shape, new_operands[0], new_operands[1], direction(), type()); } namespace { @@ -396,7 +447,10 @@ std::vector HloChannelInstruction::ExtraAttributesToStringImpl( bool HloChannelInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& - /*eq_computations*/) const { + eq_computations) const { + if (!IdenticalSlowPathIgnoringChannelIdValues(other, eq_computations)) { + return false; + } const auto& casted_other = static_cast(other); return channel_id() == casted_other.channel_id(); } @@ -424,7 +478,7 @@ std::vector HloSendRecvInstruction::ExtraAttributesToStringImpl( return attrs; } -bool HloSendRecvInstruction::IdenticalSlowPath( +bool HloSendRecvInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { @@ -545,13 +599,14 @@ std::vector HloCollectiveInstruction::ExtraAttributesToStringImpl( return result; } -bool HloCollectiveInstruction::IdenticalSlowPath( +bool HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && + return HloChannelInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && constrain_layout() == casted_other.constrain_layout() && absl::c_equal(replica_groups(), casted_other.replica_groups(), [](const ReplicaGroup& a, const ReplicaGroup& b) { @@ -594,12 +649,13 @@ HloInstructionProto HloAllGatherInstruction::ToProto() const { return proto; } -bool HloAllGatherInstruction::IdenticalSlowPath( +bool HloAllGatherInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && all_gather_dimension_ == casted_other.all_gather_dimension() && use_global_device_ids() == casted_other.use_global_device_ids(); } @@ -640,12 +696,13 @@ std::vector 
HloAllReduceInstruction::ExtraAttributesToStringImpl( return result; } -bool HloAllReduceInstruction::IdenticalSlowPath( +bool HloAllReduceInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && constrain_layout() == casted_other.constrain_layout() && use_global_device_ids() == casted_other.use_global_device_ids() && eq_computations(to_apply(), casted_other.to_apply()); @@ -696,12 +753,13 @@ std::vector HloAllToAllInstruction::ExtraAttributesToStringImpl( return result; } -bool HloAllToAllInstruction::IdenticalSlowPath( +bool HloAllToAllInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && split_dimension_ == casted_other.split_dimension(); } @@ -737,7 +795,7 @@ HloCollectivePermuteInstruction::ExtraAttributesToStringImpl( return result; } -bool HloCollectivePermuteInstruction::IdenticalSlowPath( +bool HloCollectivePermuteInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { @@ -746,7 +804,8 @@ bool HloCollectivePermuteInstruction::IdenticalSlowPath( } const auto& casted_other = static_cast(other); - return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && + return HloChannelInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && absl::c_equal(source_target_pairs(), casted_other.source_target_pairs(), [](const std::pair& a, @@ -1017,6 +1076,25 @@ HloBroadcastInstruction::CloneWithNewOperandsImpl( dimensions()); } +HloDynamicReshapeInstruction::HloDynamicReshapeInstruction( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes) + : HloInstruction(HloOpcode::kDynamicReshape, shape) { + AppendOperand(data_operand); + for (auto operand : dim_sizes) { + AppendOperand(operand); + } +} + +std::unique_ptr +HloDynamicReshapeInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_GE(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], new_operands.subspan(1)); +} + HloReshapeInstruction::HloReshapeInstruction(const Shape& shape, HloInstruction* operand, int64 inferred_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 51317b32bd0..821849bb02f 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -132,12 +132,36 @@ class HloFftInstruction : public HloInstruction { std::vector fft_length_; }; +class HloCopyStartInstruction : public HloInstruction { + public: + explicit HloCopyStartInstruction(const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch); + + bool is_cross_program_prefetch() const { return is_cross_program_prefetch_; } + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool 
IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + bool is_cross_program_prefetch_; +}; + class HloCompareInstruction : public HloInstruction { public: explicit HloCompareInstruction(const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - ComparisonDirection direction); + ComparisonDirection direction, + absl::optional type); ComparisonDirection direction() const { return compare_.GetDirection(); } + Comparison::Type type() const { return compare_.GetType(); } HloInstructionProto ToProto() const override; private: @@ -220,6 +244,15 @@ class HloChannelInstruction : public HloInstruction { absl::optional channel_id() const { return channel_id_; } void set_channel_id(const absl::optional& channel_id); + // Whether this instruction is identical to `other` except for the values of + // channel IDs, as long as both have channel IDs or neither has a channel ID. + virtual bool IdenticalSlowPathIgnoringChannelIdValues( + const HloInstruction& other, + const std::function& + eq_computations) const { + return channel_id_.has_value() == other.channel_id().has_value(); + } + protected: explicit HloChannelInstruction(HloOpcode opcode, const Shape& shape, const absl::optional& channel_id); @@ -228,10 +261,13 @@ class HloChannelInstruction : public HloInstruction { std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; + + // Do not override IdenticalSlowPath(). Override + // IdenticalSlowPathIgnoringChannelIdValues() instead. bool IdenticalSlowPath( const HloInstruction& other, const std::function& - eq_computations) const override; + eq_computations) const final; absl::optional channel_id_; }; @@ -251,7 +287,7 @@ class HloSendRecvInstruction : public HloChannelInstruction { private: std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -339,7 +375,7 @@ class HloCollectiveInstruction : public HloChannelInstruction { std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -366,7 +402,7 @@ class HloAllGatherInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -410,7 +446,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -447,7 +483,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -477,7 +513,7 @@ class HloCollectivePermuteInstruction : public HloChannelInstruction { private: std::vector ExtraAttributesToStringImpl( 
const HloPrintOptions& options) const override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -677,6 +713,25 @@ class HloBroadcastInstruction : public HloInstruction { std::vector dimensions_; }; +class HloDynamicReshapeInstruction : public HloInstruction { + public: + explicit HloDynamicReshapeInstruction( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes); + + // Returns the input dim sizes dimensions, which is operands[1:] + absl::Span dim_sizes() const { + return absl::MakeSpan(operands()).subspan(1, operand_count()); + } + + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Returns the input dim size dimension, which is operands[1+i] + HloInstruction* dim_sizes(int64 i) const { return operands()[i + 1]; } +}; + class HloReshapeInstruction : public HloInstruction { public: explicit HloReshapeInstruction(const Shape& shape, HloInstruction* operand, @@ -1139,6 +1194,8 @@ class HloOutfeedInstruction : public HloInstruction { absl::string_view outfeed_config); // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const { return outfeed_shape_; } + // Returns the mutable shape for the Outfeed instruction. + Shape* mutable_outfeed_shape() { return &outfeed_shape_; } // Returns the config for the Outfeed instruction. const string& outfeed_config() const { return outfeed_config_; } void set_outfeed_config(const string& config) { outfeed_config_ = config; } diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index 5502665e886..749193a83ef 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -281,6 +281,7 @@ TokKind HloLexer::LexIdentifier() { KEYWORD(ROOT); KEYWORD(maximal); KEYWORD(replicated); + KEYWORD(last_tile_dim_replicate); #undef KEYWORD @@ -495,6 +496,8 @@ string TokKindToString(TokKind kind) { return "kw_maximal"; case TokKind::kw_replicated: return "kw_replicated"; + case TokKind::kw_last_tile_dim_replicate: + return "kw_last_tile_dim_replicate"; case TokKind::kw_nan: return "kw_nan"; case TokKind::kw_inf: diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h index 6a59f180ad8..b8c7debaab4 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -61,6 +61,7 @@ enum class TokKind { kw_false, kw_maximal, kw_replicated, + kw_last_tile_dim_replicate, kw_nan, kw_inf, diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index cb5cbd05d65..9c6509d8b73 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -276,10 +276,10 @@ TEST_F(HloMatchersTest, AsyncCopyMatcher) { /*element_size_in_bits=*/0, /*memory_space=*/2); auto p0 = HloInstruction::CreateParameter(0, shape_memspace1, "p0"); - auto copy_start = HloInstruction::CreateUnary( + auto copy_start = HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape( {shape_memspace2, shape_memspace1, ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, p0.get()); + p0.get()); auto copy_done = HloInstruction::CreateUnary( shape_memspace2, HloOpcode::kCopyDone, copy_start.get()); diff --git 
a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc index 8ee8d332aff..076e31dc8eb 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc @@ -50,9 +50,9 @@ int64 PeakMemoryUseOfEntryComputation( HloComputation* computation = module->entry_computation(); const HloInstructionSequence& sequence = schedule.sequence(computation); - return HeapSimulator::Run(absl::make_unique(), - *computation, sequence, *alias_analysis, - size_function) + return HeapSimulator::Run( + absl::make_unique>(), + *computation, sequence, *alias_analysis, size_function) .ValueOrDie() .heap_size; } diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc index eaed707607d..8158d198799 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.cc +++ b/tensorflow/compiler/xla/service/hlo_module_config.cc @@ -51,12 +51,14 @@ string HloModuleConfig::compilation_cache_key() const { string key = absl::StrCat("profiling=", hlo_profiling_enabled()); StrAppend(&key, "::("); std::vector params; - for (const ShapeLayout& param_layout : - entry_computation_layout_->parameter_layouts()) { - params.push_back(param_layout.shape().DebugString()); + if (entry_computation_layout_.has_value()) { + for (const ShapeLayout& param_layout : + entry_computation_layout_->parameter_layouts()) { + params.push_back(param_layout.shape().DebugString()); + } + StrAppend(&key, absl::StrJoin(params, ", "), ") => ", + entry_computation_layout_->result_shape().SerializeAsString()); } - StrAppend(&key, absl::StrJoin(params, ", "), ") => ", - entry_computation_layout_->result_shape().SerializeAsString()); if (seed() != 0) { // TODO(b/32083678): force recompilation to reset global state. 
static std::atomic counter{0}; diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 1625d0bbae4..b50c7d9a584 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -123,6 +123,7 @@ namespace xla { V(kRemainder, "remainder", 2) \ V(kReplicaId, "replica-id", 0) \ V(kReshape, "reshape", 1) \ + V(kDynamicReshape, "dynamic-reshape", kHloOpcodeIsVariadic) \ V(kReverse, "reverse", 1) \ V(kRng, "rng", kHloOpcodeIsVariadic) \ V(kRngGetAndUpdateState, "rng-get-and-update-state", 0) \ diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc index 136e6702b21..cceb60a70e9 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc @@ -58,6 +58,7 @@ TEST(HloOpcodeTest, OpcodeProperties) { case HloOpcode::kCustomCall: case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kDynamicReshape: case HloOpcode::kFusion: case HloOpcode::kMap: case HloOpcode::kReduce: diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 0530062c43b..e2bbda3a607 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -194,6 +194,7 @@ class HloParserImpl : public HloParser { kBracedHloComputationList, kFftType, kComparisonDirection, + kComparisonType, kWindow, kConvolutionDimensionNumbers, kSharding, @@ -327,6 +328,7 @@ class HloParserImpl : public HloParser { bool ParseOpcode(HloOpcode* result); bool ParseFftType(FftType* result); bool ParseComparisonDirection(ComparisonDirection* result); + bool ParseComparisonType(Comparison::Type* result); bool ParseFusionKind(HloInstruction::FusionKind* result); bool ParseRandomDistribution(RandomDistribution* result); bool ParseRandomAlgorithm(RandomAlgorithm* result); @@ -552,33 +554,39 @@ bool HloParserImpl::ParseAliasing(AliasingData* data) { return false; } - if (lexer_.GetKind() != TokKind::kLparen) { - // Short form: "{0}: 0", output index "{}" is assumed. - int64 param_num; - ParseInt64(¶m_num); - data->emplace(std::piecewise_construct, std::forward_as_tuple(out), - std::forward_as_tuple(param_num, ShapeIndex{})); - } else { - // Long form: "{0}: (0, {0})", output index is explicitly specified. 
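// ---------------------------------------------------------------------------
// Editorial note on the new optional "type" attribute handled via
// kComparisonType above: when it is omitted, the comparison type defaults from
// the operand element type (signed/unsigned comparison for integer types, the
// regular partial order for floating-point types), while TOTALORDER requests
// the float total-order comparison. Example HLO text, adapted from the parser
// tests later in this change:
//
//   ROOT %ge = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE, type=TOTALORDER
// ---------------------------------------------------------------------------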
- if (!ParseToken(TokKind::kLparen, errmsg)) { - return false; - } - int64 param_num; - ParseInt64(¶m_num); - if (!ParseToken(TokKind::kComma, errmsg)) { - return false; - } - ShapeIndex param_idx; - if (!ParseShapeIndex(¶m_idx)) { - return false; - } - data->emplace(std::piecewise_construct, std::forward_as_tuple(out), - std::forward_as_tuple(param_num, param_idx)); - if (!ParseToken(TokKind::kRparen, errmsg)) { - return false; + if (!ParseToken(TokKind::kLparen, errmsg)) { + return false; + } + int64 param_num; + ParseInt64(¶m_num); + if (!ParseToken(TokKind::kComma, errmsg)) { + return false; + } + ShapeIndex param_idx; + if (!ParseShapeIndex(¶m_idx)) { + return false; + } + + HloInputOutputAliasConfig::AliasKind alias_kind = + HloInputOutputAliasConfig::kMayAlias; + if (EatIfPresent(TokKind::kComma)) { + std::string type; + ParseName(&type); + if (type == "must-alias") { + alias_kind = HloInputOutputAliasConfig::kMustAlias; + } else if (type == "may-alias") { + alias_kind = HloInputOutputAliasConfig::kMayAlias; + } else { + return TokenError("Unexpected aliasing kind; expected SYSTEM or USER"); } } + data->emplace(std::piecewise_construct, std::forward_as_tuple(out), + std::forward_as_tuple(param_num, param_idx, alias_kind)); + if (!ParseToken(TokKind::kRparen, errmsg)) { + return false; + } + if (!EatIfPresent(TokKind::kComma)) { break; } @@ -624,8 +632,9 @@ bool HloParserImpl::ParseHloModule(HloModule* module) { if (aliasing_data) { HloInputOutputAliasConfig alias_config(module->result_shape()); for (auto& p : *aliasing_data) { - Status st = alias_config.SetUpAlias(p.first, p.second.parameter_number, - p.second.parameter_index); + Status st = + alias_config.SetUpAlias(p.first, p.second.parameter_number, + p.second.parameter_index, p.second.kind); if (!st.ok()) { return TokenError(st.error_message()); } @@ -874,7 +883,6 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, case HloOpcode::kClz: case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: - case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: case HloOpcode::kCos: case HloOpcode::kExp: @@ -1082,6 +1090,20 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, } break; } + case HloOpcode::kCopyStart: { + // If the is_cross_program_prefetch attribute is not present then default + // to false. 
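// ---------------------------------------------------------------------------
// Editorial note on the aliasing syntax parsed above: only the parenthesized
// long form is accepted now, with an optional third element naming the alias
// kind ("must-alias" or "may-alias"; may-alias is the default when the element
// is absent). Example module header, adapted from the parser tests later in
// this change:
//
//   HloModule M, input_output_alias={ {0}: (0, {0}, must-alias), {1}: (0, {1}) }
// ---------------------------------------------------------------------------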
+ optional is_cross_program_prefetch = false; + attrs["is_cross_program_prefetch"] = {/*required=*/false, AttrTy::kBool, + &is_cross_program_prefetch}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateCopyStart( + shape, operands[0], *is_cross_program_prefetch)); + break; + } case HloOpcode::kReplicaId: { if (!ParseOperands(&operands, /*expected_size=*/0) || !ParseAttributes(attrs)) { @@ -1099,6 +1121,16 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, builder->AddInstruction(HloInstruction::CreatePartitionId()); break; } + case HloOpcode::kDynamicReshape: { + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateDynamicReshape( + shape, operands[0], + absl::Span(operands).subspan(1))); + break; + } case HloOpcode::kReshape: { optional inferred_dimension; attrs["inferred_dimension"] = {/*required=*/false, AttrTy::kInt64, @@ -1355,14 +1387,16 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, } case HloOpcode::kCompare: { optional direction; + optional type; attrs["direction"] = {/*required=*/true, AttrTy::kComparisonDirection, &direction}; + attrs["type"] = {/*required=*/false, AttrTy::kComparisonType, &type}; if (!ParseOperands(&operands, /*expected_size=*/2) || !ParseAttributes(attrs)) { return false; } instruction = builder->AddInstruction(HloInstruction::CreateCompare( - shape, operands[0], operands[1], *direction)); + shape, operands[0], operands[1], *direction, type)); break; } case HloOpcode::kCholesky: { @@ -2129,6 +2163,7 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, LocTy loc = lexer_.GetLoc(); bool maximal = false; bool replicated = false; + bool last_tile_dim_replicate = false; std::vector devices; std::vector tile_assignment_dimensions; while (lexer_.GetKind() != TokKind::kRbrace) { @@ -2180,6 +2215,10 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, } break; } + case TokKind::kw_last_tile_dim_replicate: + last_tile_dim_replicate = true; + lexer_.Lex(); + break; case TokKind::kRbrace: break; default: @@ -2218,6 +2257,7 @@ bool HloParserImpl::ParseSingleSharding(OpSharding* sharding, for (int64 device : devices) { sharding->add_tile_assignment_devices(device); } + sharding->set_replicate_on_last_tile_dim(last_tile_dim_replicate); } lexer_.Lex(); @@ -3005,6 +3045,14 @@ bool HloParserImpl::ParseAttributeHelper( ->emplace(result); return true; } + case AttrTy::kComparisonType: { + Comparison::Type result; + if (!ParseComparisonType(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } case AttrTy::kEnum: { if (lexer_.GetKind() != TokKind::kIdent) { return TokenError("expects an enumeration value"); @@ -4132,6 +4180,21 @@ bool HloParserImpl::ParseComparisonDirection(ComparisonDirection* result) { return true; } +bool HloParserImpl::ParseComparisonType(Comparison::Type* result) { + VLOG(1) << "ParseComparisonType"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects comparison type"); + } + std::string val = lexer_.GetStrVal(); + auto status_or_result = StringToComparisonType(val); + if (!status_or_result.ok()) { + return TokenError(StrFormat("expects comparison type but sees: %s", val)); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + bool 
HloParserImpl::ParseFusionKind(HloInstruction::FusionKind* result) { VLOG(3) << "ParseFusionKind"; if (lexer_.GetKind() != TokKind::kIdent) { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 484578e5e0e..620e67c3a2f 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -230,7 +230,7 @@ R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] { %v1 = f32[4]{0} parameter(0), sharding={maximal device=1} %v2 = f32[4]{0} parameter(1), sharding={maximal device=1} - %greater-than = pred[4]{0} compare(f32[4]{0} %v1, f32[4]{0} %v2), direction=GT, sharding={replicated} + %greater-than = pred[4]{0} compare(f32[4]{0} %v1, f32[4]{0} %v2), direction=GT, type=TOTALORDER, sharding={replicated} ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={} } @@ -318,7 +318,7 @@ R"(HloModule CopyStartAndCopyDone_module ENTRY %CopyStartAndCopyDone (v1: f32[], v2: f32[2,3]) -> (f32[], f32[2,3]) { %v1 = f32[] parameter(0) - %copy-start.1 = (f32[], f32[], u32[]) copy-start(f32[] %v1) + %copy-start.1 = (f32[], f32[], u32[]) copy-start(f32[] %v1), is_cross_program_prefetch=true %copy-done.1 = f32[] copy-done((f32[], f32[], u32[]) %copy-start.1) %v2 = f32[2,3]{1,0:S(1)} parameter(1) %copy-start.2 = (f32[2,3]{1,0:S(2)}, f32[2,3]{1,0:S(1)}, u32[]) copy-start(f32[2,3]{1,0:S(1)} %v2) @@ -512,7 +512,7 @@ R"(HloModule R4F32OverlapSmall_module %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { %lhs = f32[] parameter(0) %rhs = f32[] parameter(1) - ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE + ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE, type=TOTALORDER } %add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { @@ -2399,7 +2399,7 @@ ENTRY c2 { TEST_F(HloParserTest, SimpleAliasing) { const string original = R"( -HloModule Module, input_output_alias={ {0}: (0, {0}), {1}: (0, {1}) } +HloModule Module, input_output_alias={ {0}: (0, {0}, must-alias), {1}: (0, {1}) } ENTRY entry { %p = (f32[], f32[]) parameter(0) @@ -2413,42 +2413,13 @@ ENTRY entry { std::unique_ptr parsed_module = module.ConsumeValueOrDie(); EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {0}), ShapeIndex{0}); + + EXPECT_TRUE( + parsed_module->input_output_alias_config().ParameterMustAlias(0, {0})); EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {1}), ShapeIndex{1}); -} - -TEST_F(HloParserTest, SimpleAliasingShortForm) { - const string original = R"( -HloModule Module, input_output_alias={ {0}: 0, {1}: 1 } - -ENTRY entry { - %p0 = f32[] parameter(0) - %p1 = f32[] parameter(1) - ROOT %out = (f32[], f32[]) tuple(%p0, %p1) -} - )"; - auto module = ParseAndReturnVerifiedModule(original); - TF_ASSERT_OK(module.status()); - std::unique_ptr parsed_module = module.ConsumeValueOrDie(); - EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {}), - ShapeIndex{0}); - EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(1, {}), - ShapeIndex{1}); -} - -TEST_F(HloParserTest, SimpleAliasingShortFormError) { - const string original = R"( -HloModule Module, input_output_alias={ {0}: A, {1}: 1 } - -ENTRY entry { - %p0 = f32[] parameter(0) - %p1 = f32[] parameter(1) - ROOT %out = (f32[], f32[]) tuple(%p0, %p1) -} - )"; - ExpectHasSubstr( - 
ParseAndReturnUnverifiedModule(original).status().error_message(), - "expects integer"); + EXPECT_FALSE( + parsed_module->input_output_alias_config().ParameterMustAlias(0, {1})); } TEST_F(HloParserTest, NestedAliasing) { @@ -2626,6 +2597,21 @@ TEST_F(HloParserTest, ParseSharding) { EXPECT_EQ(sharding.ToString(), original); } +TEST_F(HloParserTest, ParseShardingPartialReplication) { + const string original = "{devices=[2,2]0,1,2,3 last_tile_dim_replicate}"; + TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original)); + EXPECT_EQ(sharding.ToString(), original); + Array group_tiling({2}); + group_tiling(0) = 0; + group_tiling(1) = 1; + std::vector group0_members({0, 1}); + std::vector group1_members({2, 3}); + EXPECT_EQ( + HloSharding::PartialTile(group_tiling, {group0_members, group1_members}) + .ToString(), + original); +} + TEST_F(HloParserTest, ParseFrontendAttributes) { const string original = R"({attr_a="test_a",attr_b="b",attr_c="s64",attr_d="a/b"})"; diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h index a22a394c6a4..1de231a9a86 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_fix.h +++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h @@ -43,11 +43,12 @@ class HloPassFix : public Pass { while (changed_this_iteration) { TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module)); changed |= changed_this_iteration; - VLOG(3) << "changed_this_iteration: " << changed_this_iteration; + VLOG(3) << Pass::name() << " iteration " << iteration_count + << " changed_this_iteration: " << changed_this_iteration; ++iteration_count; if (iteration_count == kLimit) { - VLOG(1) << "Unexpectedly high number of iterations in HLO passes, " - "exiting fixed point loop."; + VLOG(1) << "Unexpectedly high number of iterations in HLO passes '" + << Pass::name() << "' exiting fixed point loop."; // Return false in case this is fixed point is nested. 
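The HloPassFix hunk here only improves the logging, but the control flow it wraps is worth spelling out. The following is a standalone sketch of the same fixed-point pattern, not the XLA template itself; the iteration limit of 25 is an arbitrary stand-in for the class's kLimit constant.

#include <functional>
#include <iostream>

// Re-run a pass until it stops reporting changes, and bail out (reporting
// "no change") once an iteration limit is hit so that a nested fixed point
// also terminates.
bool RunToFixedPoint(const std::function<bool()>& run_pass_once,
                     int iteration_limit = 25) {
  bool changed = false;
  bool changed_this_iteration = true;
  int iteration_count = 0;
  while (changed_this_iteration) {
    changed_this_iteration = run_pass_once();
    changed |= changed_this_iteration;
    ++iteration_count;
    if (iteration_count == iteration_limit) {
      // Unexpectedly many iterations: give up, and return false in case this
      // fixed point is nested inside another one.
      return false;
    }
  }
  return changed;
}

int main() {
  int budget = 3;  // Pretend the pass makes progress three times, then stops.
  bool changed = RunToFixedPoint([&] { return budget-- > 0; });
  std::cout << "changed: " << changed << "\n";  // prints "changed: 1"
}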
return false; } diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index b07ab10827a..3b7b0b61f0a 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -69,6 +69,9 @@ StatusOr HloPassPipeline::RunPassesInternal( } TF_ASSIGN_OR_RETURN(bool pass_changed, RunHelper(pass, hlo)); changed |= pass_changed; + if (pass_changed) { + VLOG(3) << " Pass caused changes" << pass->name(); + } TF_RETURN_IF_ERROR(RunInvariantCheckers(hlo, pass_name)); last_pass_name = string(pass_name); if (!pass->IsPassPipeline()) { diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 83130108dd7..3a5e7ca6f40 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -259,9 +259,15 @@ StatusOr> HloRunner::ExecuteReplicated( return ExecuteReplicated(executable.get(), options, device_assignment); } -StatusOr> HloRunner::ExecuteReplicated( - Executable* executable, const ReplicatedExecuteOptions& options, - DeviceAssignment* device_assignment, ExecutionProfile* profile) { +StatusOr> HloRunner::ExecuteReplicatedImpl( + std::function>( + const std::vector&, + const std::vector>&)> + execution_helper, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment) { std::vector> streams; std::vector service_run_options; @@ -269,12 +275,19 @@ StatusOr> HloRunner::ExecuteReplicated( // This reserve() call is necessary for correctness, because // argument_buffer_ptrs contains pointers into the elements of // argument_buffers. - argument_buffers.reserve(options.num_replicas * options.arguments.size()); + const int64 total_argument_count = [&]() { + int64 total = 0; + for (int64 i = 0; i < options.num_replicas; ++i) { + total += argument_count_provider(i); + } + return total; + }(); + argument_buffers.reserve(total_argument_count); // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are // no arguments. - std::vector argument_buffer_ptrs( - options.num_replicas * options.arguments.size() + 1); + std::vector argument_buffer_ptrs(total_argument_count + + 1); std::vector> argument_buffer_slices; int64 index = 0; RunId run_id; @@ -288,7 +301,10 @@ StatusOr> HloRunner::ExecuteReplicated( device, streams.back().get(), device_assignment, run_id)); // Copy arguments to device. 
- for (const Literal* argument : options.arguments) { + const int64 argument_count = argument_count_provider(i); + for (int64 arg_index = 0; arg_index < argument_count; arg_index++) { + const Literal* const argument = argument_provider(i, arg_index); + TF_RET_CHECK(argument != nullptr); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer argument_buffer, backend().transfer_manager()->AllocateScopedShapedBuffer( @@ -299,8 +315,7 @@ StatusOr> HloRunner::ExecuteReplicated( argument_buffer_ptrs[index++] = &argument_buffers.back(); } argument_buffer_slices.emplace_back( - &argument_buffer_ptrs[index - options.arguments.size()], - options.arguments.size()); + &argument_buffer_ptrs[index - argument_count], argument_count); } std::unique_ptr pool; @@ -355,39 +370,9 @@ StatusOr> HloRunner::ExecuteReplicated( } LOG(INFO) << "Replicated execution started"; - std::vector results; - if (!options.use_threads) { - TF_ASSIGN_OR_RETURN(results, - executable->ExecuteOnStreams(service_run_options, - argument_buffer_slices)); - } else { - tensorflow::mutex mutex; - std::vector> thread_results( - options.num_replicas); - { - LOG(INFO) << "Creating thread pool for " << options.num_replicas - << " replicas"; - tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), - "replicas", options.num_replicas); - for (int64 i = 0; i < options.num_replicas; ++i) { - pool.Schedule([&, i] { - auto result = executable->ExecuteOnStream( - &service_run_options[i], argument_buffer_slices[i], nullptr); - tensorflow::mutex_lock lock(mutex); - thread_results[i] = std::move(result); - }); - } - - // Note: the thread pool destructor guarantees it completes all work - // before we leave this scope. - } - for (auto& thread_result : thread_results) { - if (!thread_result.ok()) { - return thread_result.status(); - } - results.push_back(std::move(thread_result).ValueOrDie()); - } - } + TF_ASSIGN_OR_RETURN( + std::vector results, + execution_helper(service_run_options, argument_buffer_slices)); LOG(INFO) << "Replicated execution terminated"; std::vector exec_results; @@ -401,6 +386,104 @@ StatusOr> HloRunner::ExecuteReplicated( return std::move(exec_results); } +StatusOr> HloRunner::ExecuteReplicated( + Executable* executable, const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment, ExecutionProfile* profile) { + return ExecuteReplicatedImpl( + [&](const std::vector& service_run_options, + const std::vector>& + argument_buffer_slices) + -> StatusOr> { + std::vector results; + if (!options.use_threads) { + TF_ASSIGN_OR_RETURN( + results, executable->ExecuteOnStreams(service_run_options, + argument_buffer_slices)); + } else { + tensorflow::mutex mutex; + std::vector> thread_results( + options.num_replicas); + { + LOG(INFO) << "Creating thread pool for " << options.num_replicas + << " replicas"; + tensorflow::thread::ThreadPool pool( + tensorflow::Env::Default(), "replicas", options.num_replicas); + for (int64 i = 0; i < options.num_replicas; ++i) { + pool.Schedule([&, i] { + auto result = executable->ExecuteOnStream( + &service_run_options[i], argument_buffer_slices[i], + nullptr); + tensorflow::mutex_lock lock(mutex); + thread_results[i] = std::move(result); + }); + } + + // Note: the thread pool destructor guarantees it completes all work + // before we leave this scope. 
+ } + for (auto& thread_result : thread_results) { + if (!thread_result.ok()) { + return thread_result.status(); + } + results.push_back(std::move(thread_result).ValueOrDie()); + } + } + return results; + }, + [&](int64 replica) { return options.arguments.size(); }, + [&](int64 replica, int64 index) { return options.arguments[index]; }, + options, device_assignment); +} + +StatusOr> HloRunner::ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options) { + TF_ASSIGN_OR_RETURN( + DeviceAssignment device_assignment, + backend().computation_placer()->AssignDevices(options.num_replicas, 1)); + return ExecuteReplicatedImpl( + [&](const std::vector& service_run_options, + const std::vector>& + argument_buffer_slices) + -> StatusOr> { + TF_RET_CHECK(options.use_threads); + std::vector results; + tensorflow::mutex mutex; + std::vector> thread_results( + options.num_replicas); + { + LOG(INFO) << "Creating thread pool for " << options.num_replicas + << " replicas"; + tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), + "replicas", options.num_replicas); + for (int64 i = 0; i < options.num_replicas; ++i) { + for (const auto& arg : argument_buffer_slices[i]) { + TF_RET_CHECK(arg != nullptr); + } + pool.Schedule([&, i] { + auto result = executable_provider(i)->ExecuteOnStream( + &service_run_options[i], argument_buffer_slices[i], nullptr); + tensorflow::mutex_lock lock(mutex); + thread_results[i] = std::move(result); + }); + } + + // Note: the thread pool destructor guarantees it completes all work + // before we leave this scope. + } + for (auto& thread_result : thread_results) { + if (!thread_result.ok()) { + return thread_result.status(); + } + results.push_back(std::move(thread_result).ValueOrDie()); + } + return results; + }, + argument_count_provider, argument_provider, options, &device_assignment); +} + StatusOr> HloRunner::ExecuteReplicated( std::unique_ptr module, const ReplicatedExecuteOptions& options) { diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 7e8b301ab54..733bb8bff54 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -176,6 +176,17 @@ class HloRunner { Executable* executable, const ReplicatedExecuteOptions& options, DeviceAssignment* device_assignment, ExecutionProfile* profile = nullptr); + // Same as above, but with different reusable Executables. This may update the + // profile information in *executables. + // + // Note that this call ignores ReplicatedExecutionOptions::run_hlo_passes, + // since we've already compiled the Executable. + StatusOr> ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options); + // If backend is not created in the constructor, creates and returns the // default backend. If creation fails, crashes the program. // @@ -193,6 +204,17 @@ class HloRunner { int64 device, se::Stream* stream, DeviceAssignment* device_assignment, RunId run_id); + // Common implementation code for ExecuteReplicated() above. 
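The ExecuteReplicatedImpl refactor above threads two callbacks, argument_count_provider and argument_provider, through the common code so that different replicas may receive different argument lists. A standalone sketch of the bookkeeping involved follows, using plain STL types rather than the HloRunner API; it also shows why the buffer vector is reserved up front before any pointers into it are taken.

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Sum the per-replica argument counts so storage can be reserved exactly once.
int64_t TotalArgumentCount(
    int64_t num_replicas,
    const std::function<int64_t(int64_t)>& argument_count_provider) {
  int64_t total = 0;
  for (int64_t i = 0; i < num_replicas; ++i) {
    total += argument_count_provider(i);
  }
  return total;
}

int main() {
  // Replica 0 gets 2 arguments, replica 1 gets 3.
  std::vector<int64_t> counts = {2, 3};
  auto count_provider = [&](int64_t replica) { return counts[replica]; };

  std::vector<int> argument_buffers;        // stands in for the device buffers
  std::vector<const int*> argument_buffer_ptrs;
  // Reserving up front is required for correctness: argument_buffer_ptrs holds
  // pointers into argument_buffers, so it must never reallocate mid-loop.
  argument_buffers.reserve(TotalArgumentCount(2, count_provider));
  for (int64_t replica = 0; replica < 2; ++replica) {
    for (int64_t arg = 0; arg < count_provider(replica); ++arg) {
      argument_buffers.push_back(static_cast<int>(replica * 10 + arg));
      argument_buffer_ptrs.push_back(&argument_buffers.back());
    }
  }
  std::cout << argument_buffer_ptrs.size() << "\n";  // prints 5
}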
+ StatusOr> ExecuteReplicatedImpl( + std::function>( + const std::vector&, + const std::vector>&)> + execution_helper, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment); + std::unique_ptr backend_; }; diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index b0a03707efb..4244cdaceea 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -39,6 +39,47 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) { return HloSharding(assignment); } +HloSharding HloSharding::PartialTile( + const Array& group_tile_assignment, + absl::Span> replication_groups) { + auto new_tile_dims = group_tile_assignment.dimensions(); + new_tile_dims.push_back(replication_groups[0].size()); + auto new_tile_assignment = Array(new_tile_dims); + new_tile_assignment.Each([&](absl::Span indices, int64* device) { + std::vector group_index(indices.begin(), indices.end()); + group_index.pop_back(); + int64 group = group_tile_assignment(group_index); + *device = replication_groups[group][indices.back()]; + }); + return PartialTile(new_tile_assignment); +} + +HloSharding HloSharding::PartialTile( + const Array& tile_assignment_last_dim_replicate) { + std::vector> sorted_groups( + tile_assignment_last_dim_replicate.num_elements() / + tile_assignment_last_dim_replicate.dimensions().back()); + auto get_group_id = [&](absl::Span indices) { + int64 group_id = 0; + for (int64 i = 0; i < indices.size() - 1; ++i) { + group_id *= tile_assignment_last_dim_replicate.dim(i); + group_id += indices[i]; + } + return group_id; + }; + tile_assignment_last_dim_replicate.Each( + [&](absl::Span indices, const int64 device) { + sorted_groups[get_group_id(indices)].insert(device); + }); + Array sorted_tile(tile_assignment_last_dim_replicate.dimensions()); + sorted_tile.Each([&](absl::Span indices, int64* device) { + auto begin = sorted_groups[get_group_id(indices)].begin(); + *device = *begin; + sorted_groups[get_group_id(indices)].erase(begin); + }); + return HloSharding(sorted_tile, /*replicate_on_last_tile_dim=*/true); +} + HloSharding HloSharding::Tuple(const ShapeTree& sub_shardings) { std::vector flattened_list; flattened_list.reserve(sub_shardings.leaf_count()); @@ -101,8 +142,10 @@ string HloSharding::ToString() const { return StrCat( "{maximal device=", static_cast(*tile_assignment_.begin()), "}"); } - return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]", - StrJoin(tile_assignment_, ","), "}"); + return StrCat( + "{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]", + StrJoin(tile_assignment_, ","), + replicate_on_last_tile_dim_ ? 
" last_tile_dim_replicate}" : "}"); } bool HloSharding::UsesDevice(int64 device) const { @@ -148,6 +191,9 @@ std::vector HloSharding::TileIndexForDevice(int64 device) const { } }); CHECK(!ret_index.empty()); + if (replicate_on_last_tile_dim_) { + ret_index.pop_back(); + } return ret_index; } @@ -157,6 +203,12 @@ int64 HloSharding::DeviceForTileIndex(absl::Span index) const { if (maximal_) { return *tile_assignment_.begin(); } + if (replicate_on_last_tile_dim_ && + index.size() < tile_assignment().num_dimensions()) { + std::vector first_replicated_index(index.begin(), index.end()); + first_replicated_index.push_back(0); + return tile_assignment_(first_replicated_index); + } return tile_assignment_(index); } @@ -167,8 +219,11 @@ std::vector HloSharding::TileOffsetForDevice(const Shape& shape, if (maximal_) { return std::vector(shape.dimensions_size(), 0); } - - CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + if (replicate_on_last_tile_dim_) { + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions() - 1); + } else { + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + } std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { const int64 shape_dim = shape.dimensions(i); @@ -187,7 +242,8 @@ std::vector HloSharding::TileLimitForDevice(const Shape& shape, shape.dimensions().end()); } - CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + CHECK_EQ(shape.dimensions_size() + (ReplicateOnLastTileDim() ? 1 : 0), + tile_assignment_.num_dimensions()); std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { const int64 shape_dim = shape.dimensions(i); @@ -341,8 +397,10 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, return Status::OK(); } - // The tile assignment tensor must have the same rank as the input. - if (shape.rank() != tile_assignment_.num_dimensions()) { + // The tile assignment tensor must have the same rank as the input, or input + // rank + 1 for replicate_on_last_tile_dim_. + if (shape.rank() + (replicate_on_last_tile_dim_ ? 1 : 0) != + tile_assignment_.num_dimensions()) { return tensorflow::errors::InvalidArgument( "Number of tile assignment dimensions is different to the input rank. " "sharding=", @@ -403,7 +461,8 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, proto.tile_assignment_dimensions().end())); std::copy(proto.tile_assignment_devices().begin(), proto.tile_assignment_devices().end(), tile_assignment.begin()); - return HloSharding(tile_assignment); + return proto.replicate_on_last_tile_dim() ? 
PartialTile(tile_assignment) + : HloSharding(tile_assignment); } OpSharding HloSharding::ToProto() const { @@ -429,6 +488,7 @@ OpSharding HloSharding::ToProto() const { result.set_type(OpSharding::MAXIMAL); } else { result.set_type(OpSharding::OTHER); + result.set_replicate_on_last_tile_dim(ReplicateOnLastTileDim()); } return result; } @@ -464,6 +524,17 @@ Shape HloSharding::TileShape(const Shape& shape, int64 device) const { return result_shape; } +int64 HloSharding::NumTiles() const { + if (IsTileMaximal()) { + return 1; + } + if (ReplicateOnLastTileDim()) { + return tile_assignment().num_elements() / + tile_assignment().dimensions().back(); + } + return tile_assignment().num_elements(); +} + HloSharding HloSharding::GetSubSharding(const Shape& shape, const ShapeIndex& index) const { CHECK(IsTuple()); @@ -516,6 +587,9 @@ size_t HloSharding::Hash() const { for (uint32 v : tile_assignment_) { h = tensorflow::Hash64Combine(h, std::hash{}(v)); } + if (replicate_on_last_tile_dim_) { + h = tensorflow::Hash64Combine(h, std::hash{}(1)); + } return h; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h index 20fa7232e65..e7ba2bc0680 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.h +++ b/tensorflow/compiler/xla/service/hlo_sharding.h @@ -54,6 +54,19 @@ class HloSharding { return HloSharding(tile_assignment); } + // Creates a new sharding where data is replicated within each replication + // group, and sharded across replication groups according to + // group_tile_assignment. Replication group members will be sorted. + static HloSharding PartialTile( + const Array& group_tile_assignment, + absl::Span> replication_groups); + + // Creates a partially replicated tiled sharding with device-level tile + // assignment, where the last dimension is the additional replication + // dimension. Replication group members will be sorted. + static HloSharding PartialTile( + const Array& tile_assignment_last_dim_replicate); + // Creates a new sharding which splits a one-dimensional input shape into // `num_tiles` tiles. static HloSharding Tile1D(const Shape& input_shape, int64 num_tiles); @@ -115,6 +128,11 @@ class HloSharding { }); } + // Returns if the sharding has partial replication and partial sharding. If + // true, data is sharded according to other dimensions of tile_assignment(), + // but replicated across devices along the last dimension. + bool ReplicateOnLastTileDim() const { return replicate_on_last_tile_dim_; } + // Returns true if the sharding defines an operation on the given device. bool UsesDevice(int64 device) const; @@ -132,6 +150,10 @@ class HloSharding { // Returns the device that should execute the given tile. // It is an error to call this if is_replicated() is true. + // When ReplicateOnLastTileDim() == true, if index.size() == data rank, it + // returns the first device in that replicated subgroup; otherwise, + // index.size() should be the same as tile_assignment()'s rank and specifies + // the member of the replication subgroup. 
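A standalone sketch of the lookup rule described in the comment above, using plain arrays rather than HloSharding: with devices=[2,2]0,1,2,3 last_tile_dim_replicate, a data-rank index selects the first device of its replication group, while a full-rank index selects a specific group member. The ParseShardingPartialReplication test added earlier exercises exactly this grouping.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // devices=[2,2]0,1,2,3 last_tile_dim_replicate: group 0 = {0, 1},
  // group 1 = {2, 3}; the data is split into two tiles along one dimension.
  std::array<std::array<int64_t, 2>, 2> tile_assignment = {{{0, 1}, {2, 3}}};
  auto device_for_tile_index = [&](std::vector<int64_t> index) {
    if (index.size() < 2) index.push_back(0);  // pick the first replica
    return tile_assignment[index[0]][index[1]];
  };
  std::cout << device_for_tile_index({0}) << " "      // 0: first device of group 0
            << device_for_tile_index({1}) << " "      // 2: first device of group 1
            << device_for_tile_index({1, 1}) << "\n";  // 3: explicit group member
}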
// REQUIRES: !IsTuple() int64 DeviceForTileIndex(absl::Span index) const; @@ -188,7 +210,8 @@ class HloSharding { bool operator==(const HloSharding& other) const { return replicated_ == other.replicated_ && maximal_ == other.maximal_ && tile_assignment_ == other.tile_assignment_ && - tuple_elements_ == other.tuple_elements_; + tuple_elements_ == other.tuple_elements_ && + replicate_on_last_tile_dim_ == other.replicate_on_last_tile_dim_; } bool operator!=(const HloSharding& other) const { return !(*this == other); } @@ -220,12 +243,17 @@ class HloSharding { // REQUIRES: !IsTuple() Shape TileShape(const Shape& shape, int64 device) const; + // Gets the number of tiles. If it has partial replication, this will not + // equal the device count. + int64 NumTiles() const; + private: HloSharding() : replicated_(true), maximal_(true), tuple_(false), - tile_assignment_({0}) {} + tile_assignment_({0}), + replicate_on_last_tile_dim_(false) {} // device_id values: // -2: magic number to mean unassigned device, used by spatial partitioning // -1: the id of the host @@ -236,18 +264,22 @@ class HloSharding { : replicated_(false), maximal_(true), tuple_(false), - tile_assignment_({1}, device_id) {} - explicit HloSharding(const Array& tile_assignment) + tile_assignment_({1}, device_id), + replicate_on_last_tile_dim_(false) {} + explicit HloSharding(const Array& tile_assignment, + bool replicate_on_last_tile_dim = false) : replicated_(false), maximal_(false), tuple_(false), - tile_assignment_(tile_assignment) {} + tile_assignment_(tile_assignment), + replicate_on_last_tile_dim_(replicate_on_last_tile_dim) {} explicit HloSharding(const std::vector& tuple_shardings) : replicated_(false), maximal_(false), tuple_(true), tile_assignment_({0}), - tuple_elements_(tuple_shardings) {} + tuple_elements_(tuple_shardings), + replicate_on_last_tile_dim_(false) {} // Checks that the number of elements in tuple_elements_ is consistent with // the tuple shape passes as argument. @@ -283,6 +315,11 @@ class HloSharding { // present for the root. This is a flattened list of all the leaf shardings in // a tuple shape, by pre-order walk (ShapeTree iterator order). std::vector tuple_elements_; + // This flag is to support partial replication and partial sharding. If it is + // true, tile_assignment_ will have an extra dimension in addition to the data + // shape rank, and the added last dimension represents the subgroups of + // replications, i.e., elements in slice [..., :] will be replicated. + bool replicate_on_last_tile_dim_; }; std::ostream& operator<<(std::ostream& out, const HloSharding& sharding); diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc index 94c348cdeaa..da4e3d61a81 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include +#include #include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/array.h" @@ -105,21 +106,28 @@ HloSharding TransposeSharding(const HloSharding& sharding, if (sharding.IsTileMaximal()) { return sharding; } - const int64 rank = dimensions.size(); + auto perm_dimensions = dimensions; + if (sharding.ReplicateOnLastTileDim() && + dimensions.size() < sharding.tile_assignment().num_dimensions()) { + perm_dimensions.push_back(dimensions.size()); + } + const int64 rank = perm_dimensions.size(); std::vector tile_assignment_dim(rank); for (int64 i = 0; i < rank; ++i) { - tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + tile_assignment_dim[i] = sharding.tile_assignment().dim(perm_dimensions[i]); } Array tile_assignment = sharding.tile_assignment(); tile_assignment.Reshape(tile_assignment_dim); tile_assignment.Each([&](absl::Span indices, int64* value) { std::vector src_indices(indices.size(), -1); for (int64 i = 0; i < indices.size(); ++i) { - src_indices[dimensions[i]] = indices[i]; + src_indices[perm_dimensions[i]] = indices[i]; } *value = sharding.tile_assignment()(src_indices); }); - return HloSharding::Tile(tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } absl::optional ReshapeSharding(const Shape& source_shape, @@ -226,8 +234,14 @@ absl::optional ReshapeSharding(const Shape& source_shape, } } Array new_tile_assignment = sharding.tile_assignment(); + if (sharding.ReplicateOnLastTileDim()) { + target_tile_assignment_dimensions.push_back( + sharding.tile_assignment().dimensions().back()); + } new_tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(new_tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ReverseSharding(const HloSharding& sharding, @@ -245,7 +259,9 @@ HloSharding ReverseSharding(const HloSharding& sharding, } *device = sharding.tile_assignment()(original_indices); }); - return HloSharding::Tile(new_tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, @@ -331,17 +347,26 @@ HloSharding GatherOutputSharding(const HloSharding& index_sharding, index_dim++; } } + + if (index_sharding.ReplicateOnLastTileDim()) { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dimensions().back()); + } + Array new_tile_assignment = index_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(output_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(output_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return index_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding GatherIndexSharding(const HloSharding& output_sharding, const HloInstruction* hlo) { + CHECK(hlo->opcode() == HloOpcode::kGather); if (output_sharding.IsTileMaximal()) { return output_sharding; } @@ -354,13 +379,28 @@ HloSharding GatherIndexSharding(const HloSharding& output_sharding, output_sharding.tile_assignment().dim(i)); } } + int64 index_rank = hlo->operand(1)->shape().rank(); + + // Vector indices sharding is not supported yet. + if (index_rank > index_tile_assignment_dims.size()) { + index_tile_assignment_dims.insert( + index_tile_assignment_dims.begin() + dnums.index_vector_dim(), 1); + } + + if (output_sharding.ReplicateOnLastTileDim()) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dimensions().back()); + } + Array new_tile_assignment = output_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(index_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(index_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return output_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { @@ -430,13 +470,19 @@ HloSharding ScatterIndexSharding(const HloSharding& data_sharding, if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { index_tile_assignment_dims.push_back(1); } + if (data_sharding.ReplicateOnLastTileDim()) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dimensions().back()); + } Array new_tile_assignment = data_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(index_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(index_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return data_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ScatterDataSharding(const HloSharding& index_sharding, @@ -456,13 +502,19 @@ HloSharding ScatterDataSharding(const HloSharding& index_sharding, index_dim++; } } + if (index_sharding.ReplicateOnLastTileDim()) { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dimensions().back()); + } Array new_tile_assignment = index_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(data_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(data_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return index_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, @@ -589,9 +641,15 @@ absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( } passthrough_tile[offset_dim] = dim_partitions; } + if (operand_sharding.ReplicateOnLastTileDim()) { + passthrough_tile.push_back( + operand_sharding.tile_assignment().dimensions().back()); + } Array tile_assignment = operand_sharding.tile_assignment(); tile_assignment.Reshape(passthrough_tile); - return HloSharding::Tile(tile_assignment); + return operand_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } // Inverse of PassthroughOperandToGatherOutputOrScatterUpdate. @@ -625,12 +683,19 @@ absl::optional PassthroughGatherOutputOrScatterUpdateToOperand( } passthrough_tile[i] = dim_partitions; } + + if (update_or_gather_sharding.ReplicateOnLastTileDim()) { + passthrough_tile.push_back( + update_or_gather_sharding.tile_assignment().dimensions().back()); + } Array tile_assignment = update_or_gather_sharding.tile_assignment(); if (tile_assignment.num_elements() != Product(passthrough_tile)) { return absl::nullopt; } tile_assignment.Reshape(passthrough_tile); - return HloSharding::Tile(tile_assignment); + return update_or_gather_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } } // namespace @@ -777,5 +842,119 @@ std::vector DevicesForSharding( return devices; } +HloSharding PartiallyReplicateTiledShardingOnDims( + const HloSharding& sharding, const std::vector& dims_to_replicate) { + if (sharding.IsTileMaximal()) { + return sharding; + } + int64 group_count = 1; + for (int64 dim : dims_to_replicate) { + if (sharding.ReplicateOnLastTileDim()) { + CHECK_LT(dim, sharding.tile_assignment().num_dimensions()); + } + group_count *= sharding.tile_assignment().dim(dim); + } + if (group_count == 1) { + return sharding; + } + if (group_count == sharding.NumTiles()) { + return HloSharding::Replicate(); + } + std::vector dim_permutation( + sharding.tile_assignment().num_dimensions()); + std::iota(dim_permutation.begin(), dim_permutation.end(), 0); + absl::c_sort(dim_permutation, [&](const int64 a, const int64 b) { + return absl::c_linear_search(dims_to_replicate, a) < + absl::c_linear_search(dims_to_replicate, b); + }); + auto transposed = TransposeSharding(sharding, dim_permutation); + auto new_tile = transposed.tile_assignment(); + std::vector new_tile_shape( + sharding.tile_assignment().dimensions().begin(), + sharding.tile_assignment().dimensions().end()); + for (int64 dim : dims_to_replicate) { + new_tile_shape[dim] = 1; + } + if (sharding.ReplicateOnLastTileDim()) { + new_tile_shape.back() *= group_count; + } else { + new_tile_shape.push_back(group_count); + } + new_tile.Reshape(new_tile_shape); + return HloSharding::PartialTile(new_tile); +} + +HloSharding RemoveShapeDimensions(const HloSharding& sharding, + const std::vector& dims_to_remove) { + if (sharding.IsTileMaximal() || dims_to_remove.empty()) { + return sharding; + } + std::vector new_tile_shape; + new_tile_shape.reserve(sharding.tile_assignment().num_dimensions() - + dims_to_remove.size()); + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_linear_search(dims_to_remove, i)) { + CHECK_EQ(sharding.tile_assignment().dim(i), 1); + } else { + new_tile_shape.push_back(sharding.tile_assignment().dim(i)); + } + } + auto new_tile = sharding.tile_assignment(); + new_tile.Reshape(new_tile_shape); + return sharding.ReplicateOnLastTileDim() ? 
HloSharding::PartialTile(new_tile) + : HloSharding::Tile(new_tile); +} + +absl::optional TransposeShardingWithCollapsedDims( + const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) { + if (source.IsTileMaximal()) { + return source; + } + if (source.ReplicateOnLastTileDim() && + src_to_tgt.size() < source.tile_assignment().num_dimensions()) { + std::vector new_src_to_tgt(src_to_tgt.begin(), src_to_tgt.end()); + new_src_to_tgt.push_back(tgt_to_src.size()); + std::vector new_tgt_to_src(tgt_to_src.begin(), tgt_to_src.end()); + new_tgt_to_src.push_back(src_to_tgt.size()); + return TransposeShardingWithCollapsedDims(source, new_src_to_tgt, + new_tgt_to_src); + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return source.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(reshape_tiles) + : HloSharding::Tile(reshape_tiles); +} + } // namespace hlo_sharding_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h index cc4068121ae..0de01fcab7e 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.h +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -163,6 +163,24 @@ IdentityValueAndHloOpcodeForScatterReduceComputation( std::vector DevicesForSharding( const HloSharding& sharding, const std::vector& available_devices); +// Returns a sharding that replicates data across devices along the given +// dimensions in the original sharding. +HloSharding PartiallyReplicateTiledShardingOnDims( + const HloSharding& sharding, const std::vector& dims_to_replicate); + +// Returns a sharding the removes given tile dimensions. +// +// Precondition: if not tile maximal, the size of each tile dimension must be 1. +HloSharding RemoveShapeDimensions(const HloSharding& sharding, + const std::vector& dims_to_remove); + +// Similar to TransposeSharding(), but allows removing/adding non-partitioned +// dimensions. In src_to_tgt and tgt_to_src, -1 represents a non-existing +// dimension. 
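A standalone sketch of the bookkeeping behind PartiallyReplicateTiledShardingOnDims, declared in this header hunk: the tile counts of the replicated dimensions are multiplied into a group count, and the edge cases (a group count of one, or a group count covering every tile) short-circuit to the unchanged or fully replicated sharding. The numbers below are arbitrary, and plain integers stand in for the HloSharding types.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> tile_dims = {2, 4};   // 8 tiles over a rank-2 shape
  std::vector<int64_t> dims_to_replicate = {1};
  int64_t group_count = 1;
  for (int64_t dim : dims_to_replicate) group_count *= tile_dims[dim];
  int64_t num_tiles = 1;
  for (int64_t d : tile_dims) num_tiles *= d;
  if (group_count == 1) {
    std::cout << "sharding unchanged\n";
  } else if (group_count == num_tiles) {
    std::cout << "fully replicated\n";
  } else {
    std::cout << "partially replicated: " << num_tiles / group_count
              << " tiles, each replicated " << group_count << " ways\n";
  }
}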
+absl::optional TransposeShardingWithCollapsedDims( + const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src); + } // namespace hlo_sharding_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index d395fddcc5d..0346e9077a0 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -703,6 +703,20 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } +Status ShapeVerifier::HandleDynamicReshape(HloInstruction* dynamic_reshape) { + // Check for mixed precision. + const Shape& operand_shape = dynamic_reshape->operand(0)->shape(); + TF_RET_CHECK(SameElementType(dynamic_reshape->shape(), operand_shape)); + TF_RET_CHECK(ShapeUtil::ElementsIn(dynamic_reshape->shape()) == + ShapeUtil::ElementsIn(operand_shape)); + TF_RET_CHECK(dynamic_reshape->shape().rank() + 1 == + dynamic_reshape->operand_count()); + for (int64 i = 1; i < dynamic_reshape->operand_count(); ++i) { + TF_RET_CHECK(dynamic_reshape->operand(i)->shape().element_type() == S32); + } + return Status::OK(); +} + Status ShapeVerifier::HandleReshape(HloInstruction* reshape) { // Check for mixed precision. const Shape& operand_shape = reshape->operand(0)->shape(); @@ -1023,7 +1037,7 @@ namespace { // inputs. Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { switch (instruction->opcode()) { - // White list the following opcodes for mixed-precision check, because + // Allow-list the following opcodes for mixed-precision check, because // they involve data pass through or grouping via tuples, where the // precisions of buffers can be different. case HloOpcode::kCall: diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 85b02e0518c..03fca5938ff 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -78,6 +78,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleBitcast(HloInstruction* bitcast) override; Status HandleBroadcast(HloInstruction* broadcast) override; Status HandleReshape(HloInstruction* reshape) override; + Status HandleDynamicReshape(HloInstruction* dynamic_reshape) override; Status HandleTranspose(HloInstruction* transpose) override; Status HandleParameter(HloInstruction*) override; Status HandleFusion(HloInstruction*) override; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 8d8930615b2..11472f55792 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -102,6 +102,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kReducePrecision: case HloOpcode::kReplicaId: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReverse: case HloOpcode::kRoundNearestAfz: case HloOpcode::kSelect: @@ -515,11 +516,12 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } - VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); + VLOG(5) << "Considering fusion of: " << instruction->ToString() + << " with operand " << operand->name(); if (!operand->IsFusible()) { VLOG(3) << "Operand (" << operand->ToString() << ") is not 
fusible"; @@ -600,6 +602,9 @@ StatusOr InstructionFusion::Run(HloModule* module) { VLOG(1) << FusionConfigToString(*fusion_config); module->set_config(module_config); } + + reachability_.reset(); + VLOG(1) << "Fusion count: " << fuse_count; return changed; @@ -709,4 +714,23 @@ HloInstruction::FusionKind InstructionFusion::ChooseKind( return HloInstruction::FusionKind::kLoop; } +bool InstructionFusion::ReusesOperandElements(const HloInstruction* consumer, + int64 operand_index) { + auto operand = consumer->operand(operand_index); + auto it = reused_fusion_operands_.find(consumer); + if (it != reused_fusion_operands_.end() && it->second.contains(operand)) { + return true; + } + bool reuses = consumer->ReusesOperandElements(operand_index); + // If a parameter was reused, we can cache this information. Fusion + // computations only ever grow, so it becomes more likely that a parameter is + // reused, but a reused parameter will never become *not* reused. + if (reuses) { + // We cache the operand corresponding to the fusion parameter, because the + // parameter pointers would be invalidated after the next fusion. + reused_fusion_operands_[consumer].insert(operand); + } + return reuses; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 90d9da48e33..d51bf700371 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -1,4 +1,3 @@ -#include "absl/container/flat_hash_map.h" /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,6 +19,8 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/fusion_queue.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -138,6 +139,11 @@ class InstructionFusion : public HloModulePass { return config_collection_mode_; } + // Returns whether 'consumer' may reuse elements of its `operand_index`th + // operand. + bool ReusesOperandElements(const HloInstruction* consumer, + int64 operand_index); + private: // The set of producers whose consumers we cannot fuse into. using HloInstructionSet = std::unordered_set; @@ -172,6 +178,11 @@ class InstructionFusion : public HloModulePass { // Configuration mode. FusionConfigCollection config_collection_mode_; + // Caches which operands are reused inside fusion computations. 
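The cache declared above relies on a monotonicity argument spelled out in instruction_fusion.cc: fusion computations only ever grow, so an operand that is reused stays reused, and only positive answers need to be memoized. Below is a standalone sketch of that one-way cache, with an invented expensive_check callback standing in for HloInstruction::ReusesOperandElements and plain ints standing in for instruction pointers. The XLA version keys the cache on the operand rather than the fusion parameter because parameter pointers are invalidated by the next fusion, as the comment in the hunk notes.

#include <functional>
#include <iostream>
#include <unordered_map>
#include <unordered_set>

// Returns whether `consumer` reuses `operand`, caching only "true" answers:
// the property can flip from false to true as fusions grow, but never back.
bool ReusesOperand(int consumer, int operand,
                   const std::function<bool(int, int)>& expensive_check,
                   std::unordered_map<int, std::unordered_set<int>>* cache) {
  auto it = cache->find(consumer);
  if (it != cache->end() && it->second.count(operand)) return true;
  bool reuses = expensive_check(consumer, operand);
  if (reuses) (*cache)[consumer].insert(operand);  // only cache "true"
  return reuses;
}

int main() {
  int calls = 0;
  auto check = [&](int, int) { ++calls; return true; };
  std::unordered_map<int, std::unordered_set<int>> cache;
  ReusesOperand(/*consumer=*/1, /*operand=*/7, check, &cache);
  ReusesOperand(1, 7, check, &cache);  // served from the cache
  std::cout << "expensive checks: " << calls << "\n";  // prints 1
}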
+ absl::flat_hash_map> + reused_fusion_operands_; + TF_DISALLOW_COPY_AND_ASSIGN(InstructionFusion); }; diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 7a4eefc1ab6..3444d4cae42 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -34,6 +34,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:algebraic_simplifier", "//tensorflow/compiler/xla/service:cholesky_expander", + "//tensorflow/compiler/xla/service:comparison_expander", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:custom_call_target_registry", diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 1649be2ca8f..a059482d832 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -21,6 +21,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/cholesky_expander.h" +#include "tensorflow/compiler/xla/service/comparison_expander.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" @@ -81,6 +82,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass( hlo_module->mutable_entry_computation_layout(), diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index cc7fdeaf0f6..1446b55f5a8 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -52,6 +52,7 @@ InterpreterExecutable::InterpreterExecutable( } StatusOr InterpreterExecutable::Evaluate( + const ServiceExecutableRunOptions* run_options, const HloComputation& computation, absl::Span arg_literals) { // Execute the graph using the HloEvaluator. tensorflow::mutex_lock lock(evaluator_lock_); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index ce68a8472f5..514ed029a22 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -51,7 +51,8 @@ class InterpreterExecutable : public InterpreterExecutableBase { static int64 ShapeSizeBytes(const Shape& shape); protected: - StatusOr Evaluate(const HloComputation& computation, + StatusOr Evaluate(const ServiceExecutableRunOptions* run_options, + const HloComputation& computation, absl::Span arg_literals) override TF_LOCKS_EXCLUDED(evaluator_lock_); diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc index 4b6a8aa5202..745750bffe1 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -50,11 +50,15 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( // TransferManager methods below. 
std::vector argument_buffers; argument_buffers.reserve(arguments.size()); + int device_ordinal = run_options->device_ordinal(); + if (device_ordinal < 0) { + device_ordinal = 0; + } for (auto& argument : arguments) { const ShapeTree& buffers = argument.Buffers(); argument_buffers.push_back(ShapedBuffer(buffers.shape(), buffers.shape(), /*platform=*/nullptr, - /*device_ordinal=*/0)); + /*device_ordinal=*/device_ordinal)); auto in_it = buffers.begin(); auto out_it = argument_buffers.back().buffers().begin(); for (; in_it != buffers.end(); ++in_it, ++out_it) { @@ -118,7 +122,7 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( } TF_ASSIGN_OR_RETURN(Literal result_literal, - Evaluate(*computation, arg_literals)); + Evaluate(run_options, *computation, arg_literals)); // Shrink the generated dynamic shape into static shape. result_literal = result_literal.ToStatic(); diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.h b/tensorflow/compiler/xla/service/interpreter/executable_base.h index a02ab7af8d0..eb47841a179 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.h +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.h @@ -44,6 +44,7 @@ class InterpreterExecutableBase : public Executable { protected: virtual StatusOr Evaluate( + const ServiceExecutableRunOptions* run_options, const HloComputation& computation, absl::Span arg_literals) = 0; diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 9e4bdeb2b2d..9416b11a07e 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -38,7 +38,6 @@ limitations under the License. #include "tensorflow/stream_executor/launch_dim.h" #include "tensorflow/stream_executor/plugin.h" #include "tensorflow/stream_executor/rng.h" -#include "tensorflow/stream_executor/shared_memory_config.h" #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -182,15 +181,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { return true; } - SharedMemoryConfig GetDeviceSharedMemoryConfig() override { - return SharedMemoryConfig::kDefault; - } - - port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override { - return port::Status{port::error::UNIMPLEMENTED, - "Shared memory not supported"}; - } - std::unique_ptr CreateEventImplementation() override { return nullptr; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index bea0f1fb93c..55569cfde0e 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -1891,7 +1891,7 @@ Status LayoutAssignment::RunOnComputation( ? 
ShapeUtil::GetSubshape(instruction->literal().shape(), buffer.index()) .layout() - : LayoutUtil::GetDefaultLayoutForShape(buffer.shape()); + : GetUnconstrainedLayout(buffer); TF_RETURN_IF_ERROR(constraints.SetBufferLayout(new_layout, buffer, /*mandatory=*/false)); @@ -2278,6 +2278,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kReduce: case HloOpcode::kReplicaId: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kRng: case HloOpcode::kRngBitGenerator: case HloOpcode::kRngGetAndUpdateState: diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index a04d056c618..def620bcee9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -27,6 +27,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -338,6 +339,9 @@ class LayoutAssignment : public HloModulePass { const ResultLayoutConstraint& layout_constraint, LayoutConstraints* constraints); + virtual Layout GetUnconstrainedLayout(const LogicalBuffer& buffer) { + return LayoutUtil::GetDefaultLayoutForShape(buffer.shape()); + } // Called after layouts of an instruction have been finalized to allow // subclasses to check for platform specific assumptions. virtual Status Verify(const HloInstruction* instruction) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index 0371ce71874..6aa33a10d64 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -244,16 +244,7 @@ bool FusedIrEmitter::IsFusedIrEmitterInefficient( } else { total = 0; for (const auto* user : indexing_users[instruction]) { - int64 weight = 1; - // Concatenate is special: the index differs for each operand, so - // in the worst case we have to deal with as many index values as - // the number of operands of Concatenate. By considering the worst - // case, we are more conservative than necessary regarding - // refusing to fuse. - if (user->opcode() == HloOpcode::kConcatenate) { - weight = user->operand_count(); - } - total += index_usage_count[user] * weight; + total += index_usage_count[user]; } } for (const auto* operand : instruction->operands()) { @@ -298,15 +289,9 @@ bool FusedIrEmitter::IsFusedIrEmitterInefficient( evaluate_fusion_computation(producer); } - // Sum up the total number of emitted ops. - int64 total = 0; - for (const auto& entry : index_usage_count) { - total += entry.second; - } - // Check that the code duplication has at most a factor of 15 (where 15 is an // arbitrary constant that seems to work). 
- return total > 15 * index_usage_count.size(); + return index_usage_count[producer] > 15; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index b01ae2efe43..2963d546380 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -415,9 +415,10 @@ llvm::Instruction* AddRangeMetadata(int64 lower, int64 upper, return inst; } -string IrName(string a) { - a.erase(std::remove(a.begin(), a.end(), '%'), a.end()); - return a; +string IrName(absl::string_view a) { + std::string s(a); + s.erase(std::remove(s.begin(), s.end(), '%'), s.end()); + return s; } string IrName(absl::string_view a, absl::string_view b) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 642965b6470..c0a55e4da33 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -87,7 +87,7 @@ string DumpModuleToString(const llvm::Module& module); // - joining all of the nonempty inputs by '.', and then // - removing all '%'s. // -string IrName(string a); +string IrName(absl::string_view a); string IrName(absl::string_view a, absl::string_view b); string IrName(const HloInstruction* a, absl::string_view b = ""); diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc index daf98478194..d89a9c2e0a5 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc @@ -62,10 +62,11 @@ void EmitTuple(const IrArray& tuple, absl::Span operands, llvm::IRBuilder<>* b) { llvm::Module* module = getModuleFromBuilder(b); for (size_t i = 0; i < operands.size(); ++i) { + auto* cast = + b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)); auto* store = b->CreateStore( - b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)), - b->CreateInBoundsGEP(tuple.GetBasePointer(), - {b->getInt64(0), b->getInt64(i)})); + cast, b->CreateInBoundsGEP(tuple.GetBasePointer(), + {b->getInt64(0), b->getInt64(i)})); tuple.AnnotateLoadStoreInstructionWithMetadata(store); } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 0f7daa67800..2963fde9036 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -80,7 +80,7 @@ float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( } float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, MemorySpaceAssignmentCostAnalysis::Cache* cache) const { const HloInstruction& defining_instruction = *interval.buffer->defining_instruction(); @@ -119,14 +119,10 @@ float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( } } - // Get performance slowdown in seconds of prefetching current BufferInterval - // causing to other BufferIntervals. - float alternate_mem_slowdown = - GetInstructionElapsedDueToMemorySlowdown(interval.size); - - // Divide by the size of the buffer to prioritize smaller buffers that will - // give the largest alternate memory benefit. 
- return (alternate_mem_benefit - alternate_mem_slowdown) / interval.size; + // Penalize larger buffers by dividing the benefit by the square root of the + // size. Empirically, we observed this resulted in better performance compared + // to dividing by the size. + return alternate_mem_benefit / std::sqrt(interval.size); } int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( @@ -236,15 +232,26 @@ int64 InstructionCountPrefetchIntervalPicker::PreferredEvictionEndTime( } int64 InstructionCountPrefetchIntervalPicker::LatestPrefetchStartTime( - const HloUse& use, int64 start_time, int64 end_time) const { + const Shape& shape, int64 start_time, int64 end_time, + const HloUse* use) const { return end_time - min_overlap_count_; } +int64 InstructionCountPrefetchIntervalPicker::PreferredPrefetchStartTime( + const Shape& shape, int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, int64 prefetch_end_time) const { + return std::max(earliest_prefetch_start_time, + prefetch_end_time - max_overlap_count_); +} + void InstructionCountPrefetchIntervalPicker::Begin(const HloUse& use, int64 start_time, int64 end_time) { end_time_ = end_time; - current_prefetch_time_ = std::max(start_time, end_time_ - max_overlap_count_); + const Shape& shape = ShapeUtil::GetSubshape( + use.instruction->operand(use.operand_number)->shape(), use.operand_index); + current_prefetch_time_ = + PreferredPrefetchStartTime(shape, start_time, end_time, end_time); } int64 InstructionCountPrefetchIntervalPicker::Next() { @@ -361,18 +368,22 @@ int64 CostAnalysisPrefetchIntervalPicker::PreferredEvictionEndTime( } int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( - const HloUse& use, int64 start_time, int64 end_time) const { - const Shape& shape = ShapeUtil::GetSubshape( - use.instruction->operand(use.operand_number)->shape(), use.operand_index); + const Shape& shape, int64 start_time, int64 end_time, + const HloUse* use) const { // Find the earliest time that satisfies max_async_copy_to_overlap_ratio_. float async_copy_elapsed = cost_analysis_.GetAsyncCopyElapsed(shape); - // Estimate the time we would save by having this op in alternate memory. - float elapsed_time = cost_analysis_.GetInstructionElapsed(*use.instruction); - float elapsed_time_in_alternate_mem = - cost_analysis_.GetInstructionElapsedInAlternateMemory( - *use.instruction, use.operand_number, - /*output_in_alternate_mem=*/false); - float inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; + // If there is a use, estimate the time we would save by having this op in + // alternate memory. + float inst_elapsed_reduction = 0.0f; + if (use) { + float elapsed_time = + cost_analysis_.GetInstructionElapsed(*use->instruction); + float elapsed_time_in_alternate_mem = + cost_analysis_.GetInstructionElapsedInAlternateMemory( + *use->instruction, use->operand_number, + /*output_in_alternate_mem=*/false); + inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; + } int end_nest_level = while_nest_level_[end_time]; // Find the latest time we're allowed to start prefetching. 
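The hunk above replaces the slowdown-adjusted, size-normalized benefit with alternate_mem_benefit / sqrt(size). A standalone sketch of that ranking rule follows; the types and values are illustrative stand-ins, not the patch's MemorySpaceAssignmentCostAnalysis API.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for a buffer interval: only the fields the ranking needs.
struct BufferStats {
  const char* name;
  float alternate_mem_benefit;  // Estimated seconds saved in alternate memory.
  int64_t size_bytes;
};

// Old rule: (benefit - slowdown) / size. New rule in the patch: benefit / sqrt(size),
// which still favors small buffers but penalizes large ones less aggressively.
float MemoryBoundedness(const BufferStats& b) {
  return b.alternate_mem_benefit / std::sqrt(static_cast<float>(b.size_bytes));
}

int main() {
  std::vector<BufferStats> buffers = {
      {"small_hot", 1.0e-4f, 4 * 1024},
      {"large_warm", 4.0e-4f, 1024 * 1024},
  };
  // Higher memory boundedness means higher priority for alternate memory.
  std::sort(buffers.begin(), buffers.end(),
            [](const BufferStats& x, const BufferStats& y) {
              return MemoryBoundedness(x) > MemoryBoundedness(y);
            });
  for (const auto& b : buffers) {
    std::cout << b.name << " boundedness=" << MemoryBoundedness(b) << "\n";
  }
  return 0;
}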
@@ -390,6 +401,33 @@ int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( return latest_prefetch_time; } +int64 CostAnalysisPrefetchIntervalPicker::PreferredPrefetchStartTime( + const Shape& shape, int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, int64 prefetch_end_time) const { + // Between the earliest and latest prefetch interval, find the interval + // closest to the preferred interval and start iterating from there. + float async_copy_elapsed = cost_analysis_.GetAsyncCopyElapsed(shape); + int64 preferred_prefetch_start_time = earliest_prefetch_start_time; + float preferred_interval = + preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed; + float best_interval = GetLogicalIntervalElapsed(earliest_prefetch_start_time, + prefetch_end_time); + int end_nest_level = while_nest_level_[prefetch_end_time]; + for (int64 prefetch_start_time = earliest_prefetch_start_time + 1; + prefetch_start_time <= latest_prefetch_start_time; + ++prefetch_start_time) { + float interval = + GetLogicalIntervalElapsed(prefetch_start_time, prefetch_end_time); + if (while_nest_level_[prefetch_start_time] == end_nest_level && + std::abs(preferred_interval - interval) < + std::abs(preferred_interval - best_interval)) { + best_interval = interval; + preferred_prefetch_start_time = prefetch_start_time; + } + } + return preferred_prefetch_start_time; +} + int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchEndTime( int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const { // Iterate towards the beginning until we find a suitable end time that is the @@ -422,7 +460,8 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, // Find the latest time we're allowed to start prefetching. float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed_; - latest_prefetch_time_ = LatestPrefetchStartTime(use, start_time, end_time); + latest_prefetch_time_ = + LatestPrefetchStartTime(shape, start_time, end_time, &use); // Find the earliest time we're allowed to start prefetching. float max_interval = max_async_copy_to_overlap_ratio_ * @@ -443,24 +482,10 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, return; } - // Between the earliest and latest prefetch interval, find the interval - // closest to the preferred interval and start iterating from there. 
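The new PreferredPrefetchStartTime above walks the candidate start times and keeps the one whose logical interval to the prefetch end is closest to preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed, skipping times at a different while-nest level. A self-contained sketch of that selection, with the cost model reduced to a plain cumulative-elapsed array (all names here are hypothetical):

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// elapsed[t] = cumulative logical elapsed time up to instruction t (monotone).
float IntervalElapsed(const std::vector<float>& elapsed, int64_t start, int64_t end) {
  return elapsed[end] - elapsed[start];
}

// Pick the start time in [earliest, latest] whose interval to `end` is closest
// to the preferred interval, restricted to times at the same nest level as `end`.
int64_t PreferredPrefetchStartTime(const std::vector<float>& elapsed,
                                   const std::vector<int>& nest_level,
                                   int64_t earliest, int64_t latest, int64_t end,
                                   float preferred_interval) {
  int64_t best_time = earliest;
  float best_interval = IntervalElapsed(elapsed, earliest, end);
  for (int64_t t = earliest + 1; t <= latest; ++t) {
    float interval = IntervalElapsed(elapsed, t, end);
    if (nest_level[t] == nest_level[end] &&
        std::abs(preferred_interval - interval) <
            std::abs(preferred_interval - best_interval)) {
      best_interval = interval;
      best_time = t;
    }
  }
  return best_time;
}

int main() {
  std::vector<float> elapsed = {0, 1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int> nest_level = {0, 0, 1, 1, 0, 0, 0, 0, 0};
  // Prefer to overlap ~3 units of compute with an async copy ending at t=8.
  std::cout << PreferredPrefetchStartTime(elapsed, nest_level,
                                          /*earliest=*/1, /*latest=*/7, /*end=*/8,
                                          /*preferred_interval=*/3.0f)
            << "\n";  // Prints 5: elapsed(5,8)=3 and t=5 is at nest level 0.
  return 0;
}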
- int64 starting_prefetch_time = earliest_prefetch_time_; + int64 starting_prefetch_time = PreferredPrefetchStartTime( + shape, earliest_prefetch_time_, latest_prefetch_time_, end_logical_time_); float preferred_interval = preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed_; - float best_interval = - GetLogicalIntervalElapsed(earliest_prefetch_time_, end_logical_time_); - for (int64 prefetch_time = earliest_prefetch_time_ + 1; - prefetch_time <= latest_prefetch_time_; ++prefetch_time) { - float interval = - GetLogicalIntervalElapsed(prefetch_time, end_logical_time_); - if (while_nest_level_[prefetch_time] == end_nest_level && - std::abs(preferred_interval - interval) < - std::abs(preferred_interval - best_interval)) { - best_interval = interval; - starting_prefetch_time = prefetch_time; - } - } VLOG(4) << "Interval min/max/preferred = " << min_interval << " " << max_interval << " " << preferred_interval << " prefetch time earliest/latest/starting = " @@ -570,7 +595,8 @@ std::string CostAnalysisPrefetchIntervalPicker::ToNoCopyDebugString( absl::optional CostAnalysisPrefetchIntervalPicker::BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const { return cost_analysis_.GetMemoryBoundedness(interval); } @@ -610,7 +636,9 @@ std::string MemorySpaceAssignment::AllocationValue::ToShortString() const { } void AlternateMemoryBestFitHeap::CreateAllocationValues( - const HloValue* value, std::vector* allocation_values) { + const AlternateMemoryBestFitHeap::BufferInterval& buffer_interval, + std::vector& allocation_values) const { + const HloValue* value = buffer_interval.buffer; VLOG(3) << "Creating AllocationValues for: " << value->ToString(); // Find and sort all non-trivial (excluding GTE, Tuple, and bitcast) @@ -638,10 +666,10 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( // Create an AllocationValue for each non-trivial position. 
absl::flat_hash_set computations; - int beginning_idx = allocation_values->size(); + int beginning_idx = allocation_values.size(); for (int i = 0; i < positions.size(); ++i) { const HloPosition& position = positions.at(i); - allocation_values->emplace_back(value, position); + allocation_values.emplace_back(value, position, buffer_interval.size); } std::vector uses(value->uses()); @@ -662,8 +690,8 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( HloComputation* use_computation = use.instruction->parent(); AllocationValue* last_allocation_value = nullptr; - for (int i = beginning_idx; i < allocation_values->size(); ++i) { - AllocationValue* allocation_value = &allocation_values->at(i); + for (int i = beginning_idx; i < allocation_values.size(); ++i) { + AllocationValue* allocation_value = &allocation_values.at(i); if (allocation_value->computation() == use_computation && instruction_schedule.at( allocation_value->defining_position().instruction) < use_time) { @@ -674,9 +702,9 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( last_allocation_value->AddUse(use, use_time); } - for (int i = beginning_idx; i < allocation_values->size(); ++i) { + for (int i = beginning_idx; i < allocation_values.size(); ++i) { VLOG(3) << "Created allocation value: " - << allocation_values->at(i).ToString(); + << allocation_values.at(i).ToString(); } } @@ -731,9 +759,9 @@ void AlternateMemoryBestFitHeap::FindAliases( } } -std::vector +std::vector AlternateMemoryBestFitHeap::GetSortedColocatedIntervals( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const AlternateMemoryBestFitHeap::BufferInterval& interval) const { std::vector colocated_intervals; std::vector worklist = {&interval}; while (!worklist.empty()) { @@ -862,7 +890,7 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( } void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const AlternateMemoryBestFitHeap::BufferInterval& interval, std::string* debug_str) const { // Columns in buffer information: // buffer_id: int. This value can be used to match the allocation in @@ -920,27 +948,27 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( } void AlternateMemoryBestFitHeap::AppendAllocationInfoDebugString( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const AllocationValue& value, const MemorySpaceAssignment::Allocation& allocation, - std::string* debug_str) const { + std::string& debug_str) const { // Columns in allocation information: // buffer_id: int. This value can be used the match with buffer info. // size: int. In bytes. // offset: int. In bytes. // start_time: int. Logical start time of the allocation. // end_time: int. Logical end time of the allocation. - if (debug_str->empty()) { + if (debug_str.empty()) { // Append the column names. 
- absl::StrAppend(debug_str, "buffer_id,size,offset,start_time,end_time\n"); + absl::StrAppend(&debug_str, "buffer_id,size,offset,start_time,end_time\n"); } if (allocation.memory_space() == MemorySpace::kAlternate) { const HloBuffer& buffer = - alias_analysis_.GetBufferContainingValue(*interval.buffer); - absl::StrAppend(debug_str, buffer.id(), ","); - absl::StrAppend(debug_str, interval.size, ","); - absl::StrAppend(debug_str, allocation.chunk().offset, ","); - absl::StrAppend(debug_str, allocation.start_time(), ","); - absl::StrAppend(debug_str, allocation.end_time(), "\n"); + alias_analysis_.GetBufferContainingValue(*value.value()); + absl::StrAppend(&debug_str, buffer.id(), ","); + absl::StrAppend(&debug_str, value.size(), ","); + absl::StrAppend(&debug_str, allocation.chunk().offset, ","); + absl::StrAppend(&debug_str, allocation.start_time(), ","); + absl::StrAppend(&debug_str, allocation.end_time(), "\n"); } } @@ -952,7 +980,7 @@ void AlternateMemoryBestFitHeap::DumpDebugStringsIfEnabled() const { options_.dump_fn("allocinfo", allocation_info_str_); } -HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { +HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -971,6 +999,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } } + for (const auto& interval : sorted_buffer_intervals) { + auto colocated_intervals = GetSortedColocatedIntervals(interval); + if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { + // Increment the reserved part of alternate memory so that it is not + // available for other buffers. + reserved_in_bytes_ += options_.size_fn(*interval.buffer); + } + } + VLOG(2) << "Total reserved bytes = " << reserved_in_bytes_; + for (auto& interval : sorted_buffer_intervals) { if (!interval.need_allocation) { continue; @@ -994,12 +1032,17 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { continue; } + if (interval.size > available_heap_size()) { + VLOG(3) << "Skip " << interval.buffer->ToShortString() + << " because the buffer is larger than the heap size."; + continue; + } + auto colocated_intervals = GetSortedColocatedIntervals(interval); if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { VLOG(3) << "Interval " << interval.buffer->ToShortString() - << " is reserved in the alternate memory. Total reserved bytes = " - << reserved_in_bytes_; + << " is reserved in the alternate memory."; for (const BufferInterval* colocated_interval : colocated_intervals) { const HloValue* value = colocated_interval->buffer; // Color all of the aliased reserved buffers here because reserved @@ -1015,10 +1058,6 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { options_.alternate_memory_space); } } - // Increment the reserved part of alternate memory so that it is not - // available for other buffers. Since all colocated intervals should have - // the same size, just use the first one. - reserved_in_bytes_ += options_.size_fn(*colocated_intervals[0]->buffer); continue; } @@ -1039,16 +1078,46 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { AppendBufferInfoDebugString(interval, &buffer_info_str_); + std::vector allocation_values; + CreateAllocationValuesFromColocatedIntervals(colocated_intervals, + allocation_values); + // Retry allocating this value with larger limits if allocation fails. 
+ bool repacked = false; for (int retry_number = 0; retry_number < options_.max_retries; retry_number++) { - final_retry_ = (retry_number == options_.max_retries - 1); + bool final_retry = (retry_number == options_.max_retries - 1); options_.prefetch_interval_picker->SetRetryNumber(retry_number); - bool success = AllocateColocatedIntervals(colocated_intervals); - if (success) { + Result result = + AllocateAllocationValues(absl::MakeSpan(allocation_values)); + VLOG(2) << "Allocation result = " + << absl::StrFormat("%x", static_cast(result)); + if (result_requires_uncommit(result) || + (!final_retry && result_failed_because_of_async_copy(result))) { + UncommitPendingChunks(absl::MakeSpan(allocation_values)); + VLOG(2) << "Couldn't allocate. Retry number " << retry_number; + } else if (result_is(result, Result::kFailOutOfMemory) && + num_repacks_ < options_.max_repacks && !repacked) { + UncommitPendingChunks(absl::MakeSpan(allocation_values)); + ++num_repacks_; + repacked = true; + CHECK_NE(options_.repacker, nullptr); + std::vector + repack_allocation_blocks; + ExportAllocationsForRepacking(repack_allocation_blocks); + VLOG(2) << "Repacking."; + auto repack_status = + options_.repacker->Repack(absl::MakeSpan(repack_allocation_blocks)); + CHECK_EQ(repack_status.status(), Status::OK()); + VLOG(2) << "Repack complete. Modified = " << *repack_status; + if (*repack_status) { + ImportRepackedAllocations(); + --retry_number; + } + } else { + FinalizeAllocations(absl::MakeSpan(allocation_values)); break; } - VLOG(2) << "Couldn't allocate. Retry number " << retry_number; } } @@ -1061,9 +1130,10 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { return result_; } -bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( - const std::vector& - colocated_intervals) { +void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( + absl::Span + colocated_intervals, + std::vector& allocation_values) { // TODO(berkin): For now, place the phi values due to conditionals in // default memory. for (const BufferInterval* colocated_interval : colocated_intervals) { @@ -1084,25 +1154,29 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( } // Create AllocationValues for all the colocated intervals. - std::vector allocation_values; for (const auto& colocated_interval : colocated_intervals) { - CreateAllocationValues(colocated_interval->buffer, &allocation_values); + CreateAllocationValues(*colocated_interval, allocation_values); } FindAliases(&allocation_values); +} + +AlternateMemoryBestFitHeap::Result +AlternateMemoryBestFitHeap::AllocateAllocationValues( + absl::Span allocation_values) { const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); // Data structure to contain the preferred offset for a given computation. // We ensure that the same offset will be allocated outside the while loop // as well as inside the while loop. 
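The rewritten retry loop above distinguishes three outcomes per attempt: uncommit and retry with looser limits, uncommit and repack on an out-of-memory failure, or finalize. A compact sketch of just that control flow, with the allocator, repacker, and uncommit steps stubbed out as callbacks (a simplified stand-in, not the real AlternateMemoryBestFitHeap interface):

#include <functional>
#include <iostream>

enum class Outcome { kSuccess, kFailRetryable, kFailOutOfMemory };

// Retryable failures uncommit and move on to the next (more permissive) retry
// number; an out-of-memory failure may trigger at most one repack, after which
// the same retry number is attempted again.
void AllocateWithRetries(int max_retries, int max_repacks,
                         const std::function<Outcome(int)>& allocate,
                         const std::function<void()>& uncommit,
                         const std::function<bool()>& repack,
                         const std::function<void()>& finalize) {
  int num_repacks = 0;
  bool repacked = false;
  for (int retry = 0; retry < max_retries; ++retry) {
    const bool final_retry = (retry == max_retries - 1);
    Outcome result = allocate(retry);
    if (result == Outcome::kFailRetryable && !final_retry) {
      uncommit();
    } else if (result == Outcome::kFailOutOfMemory &&
               num_repacks < max_repacks && !repacked) {
      uncommit();
      ++num_repacks;
      repacked = true;
      if (repack()) {
        --retry;  // Re-attempt the same retry number with the repacked offsets.
      }
    } else {
      finalize();
      break;
    }
  }
}

int main() {
  int calls = 0;
  AllocateWithRetries(
      /*max_retries=*/3, /*max_repacks=*/1,
      [&](int) {
        ++calls;
        return calls == 1 ? Outcome::kFailOutOfMemory : Outcome::kSuccess;
      },
      [] { std::cout << "uncommit\n"; },
      [] { std::cout << "repack\n"; return true; },
      [] { std::cout << "finalize\n"; });
  std::cout << "allocate() calls: " << calls << "\n";  // Prints 2.
  return 0;
}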
- absl::flat_hash_map + absl::flat_hash_map preferred_offset_for_computation; - bool allocation_success = true; - for (auto& allocation_value : allocation_values) { + Result result = Result::kSuccess; + for (AllocationValue& allocation_value : allocation_values) { int64 definition_time = instruction_schedule.at(allocation_value.defining_instruction()); - absl::optional preferred_offset; + AliasedOffset* preferred_offset = nullptr; auto preferred_offset_it = preferred_offset_for_computation.find(allocation_value.computation()); if (preferred_offset_it != preferred_offset_for_computation.end()) { @@ -1201,10 +1275,13 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( } } - // Bitcasts don't define buffers and don't directly consume buffers. Skip - // allocating buffers for bitcast uses. The uses that feed from bitcasts - // will be handled specially. - if (hlo_use.instruction->opcode() != HloOpcode::kBitcast) { + // Bitcasts don't define buffers and don't directly consume buffers. Skip + // allocating buffers for bitcast uses (unless they are the root + // instruction). The uses that feed from bitcasts will be handled + // specially. + if (hlo_use.instruction->opcode() != HloOpcode::kBitcast || + hlo_use.instruction == + hlo_use.instruction->parent()->root_instruction()) { AllocationRequest request; // Rarely, (e.g., when conditional true and false parameters are the // same), definition time can be the time of the conditional and use @@ -1212,20 +1289,19 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; - request.size = colocated_intervals[0]->size; + request.size = allocation_value.size(); request.allow_no_copy_alternate_mem_allocation = allow_no_copy_alternate_mem_allocation; request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = &use; request.allocation_value = &allocation_value; - if (!AllocateSegment(request)) { + result_mark(AllocateSegment(request), result); + if (result_requires_uncommit(result)) { // If the allocation finding failed (e.g., due to running out of // asynchronous copies), then fall back to allocating the buffer // entirely in the default memory. 
- UncommitPendingChunks(); - allocation_success = false; - break; + return result; } // If there are multiple uses, they can try using the memory allocation @@ -1248,27 +1324,11 @@ bool AlternateMemoryBestFitHeap::AllocateColocatedIntervals( if (hlo_use.instruction->opcode() == HloOpcode::kWhile && aliased_allocation->memory_space() == MemorySpace::kAlternate) { preferred_offset_for_computation[hlo_use.instruction->while_body()] = - aliased_allocation->chunk().offset; - } - } - if (!allocation_success) { - break; - } - } - if (allocation_success) { - for (AllocationValue& allocation_value : allocation_values) { - for (auto& allocation : *allocation_value.allocation_sequence()) { - AppendAllocationInfoDebugString(*colocated_intervals[0], *allocation, - &allocation_info_str_); - allocations_->push_back(std::move(allocation)); + GetAliasedOffset(*aliased_allocation); } } } - - pending_chunks_.clear(); - pending_async_copies_.clear(); - pending_required_assignments_.clear(); - return allocation_success; + return result; } bool operator<(const AsynchronousCopy& a, const AsynchronousCopy& b) { @@ -1305,6 +1365,28 @@ absl::optional AsynchronousCopyOrdering::ViolatesOrdering( return absl::nullopt; } +AlternateMemoryBestFitHeap::AliasedOffset* +AlternateMemoryBestFitHeap::GetAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation) { + auto aliased_offset_it = aliased_offset_map_.find(&allocation); + CHECK(aliased_offset_it != aliased_offset_map_.end()); + return aliased_offset_it->second; +} + +void AlternateMemoryBestFitHeap::CreateOrAddToAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation, + AlternateMemoryBestFitHeap::AliasedOffset* aliased_offset) { + CHECK(allocation.memory_space() == MemorySpace::kAlternate); + CHECK(!aliased_offset_map_.contains(&allocation)); + if (!aliased_offset) { + aliased_offsets_.push_back({allocation.chunk().offset}); + aliased_offset = &aliased_offsets_.back(); + } + CHECK_EQ(allocation.chunk().offset, aliased_offset->offset); + CHECK(aliased_offset->allocations.insert(&allocation).second); + aliased_offset_map_[&allocation] = aliased_offset; +} + /*static*/ MemorySpaceAssignment::Allocation* AlternateMemoryBestFitHeap::GetLiveAllocationAt( const MemorySpaceAssignment::AllocationSequence& allocations, int64 time) { @@ -1345,27 +1427,87 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( // Find the earliest use. const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); auto uses = buffer->uses(); - auto first_use = - absl::c_min_element(uses, [&](const HloUse& lhs, const HloUse& rhs) { - return instruction_schedule.at(lhs.instruction) < - instruction_schedule.at(rhs.instruction); - }); + auto use_schedule_compare = [&](const HloUse& lhs, const HloUse& rhs) { + return instruction_schedule.at(lhs.instruction) < + instruction_schedule.at(rhs.instruction); + }; + auto first_use = absl::c_min_element(uses, use_schedule_compare); int64 latest_prefetch_time = instruction_schedule.at(first_use->instruction); + // Find the latest use time. 
+ int64 last_use_time = instruction_schedule.at( + absl::c_max_element(uses, use_schedule_compare)->instruction); + for (const HloValue* colocation : prefetch_candidate->colocations) { + last_use_time = std::max( + last_use_time, + instruction_schedule.at( + absl::c_max_element(colocation->uses(), use_schedule_compare) + ->instruction)); + } + + int64 end_of_program_prefetch_end_time = instruction_schedule.size() - 1; + int64 end_of_program_prefetch_start_time = + options_.prefetch_interval_picker->PreferredPrefetchStartTime( + buffer->defining_position().shape(), last_use_time, + end_of_program_prefetch_end_time, end_of_program_prefetch_end_time); + VLOG(2) << "last use time = " << last_use_time + << ", end-of-program prefetch start time = " + << end_of_program_prefetch_start_time; + bool free_buffer = + (end_of_program_prefetch_start_time > last_use_time && + end_of_program_prefetch_start_time < end_of_program_prefetch_end_time); + int64 cross_program_prefetch_end_time = + free_buffer ? last_use_time : prefetch_candidate->end; + AddAsyncCopy(*allocations.back(), MemorySpace::kAlternate, chunk_candidate.chunk, prefetch_candidate->start, - prefetch_candidate->end, latest_prefetch_time, &allocations); + cross_program_prefetch_end_time, latest_prefetch_time, + &allocations, /*aliased_offset=*/nullptr, + /*is_cross_program_prefetch=*/true); absl::c_for_each(uses, [&](auto& use) { allocations.back()->AddUse(use); }); + AliasedOffset* cross_program_prefetch_offset = + GetAliasedOffset(*allocations.back()); + + if (free_buffer) { + VLOG(2) << "Adding an end-of-program prefetch for freed " + "cross-program-prefetched buffer."; + AddAsyncCopy(*allocations.front(), MemorySpace::kAlternate, + chunk_candidate.chunk, end_of_program_prefetch_start_time, + end_of_program_prefetch_end_time, + end_of_program_prefetch_end_time, &allocations, + cross_program_prefetch_offset); + CHECK_EQ(cross_program_prefetch_offset->offset, + allocations.back()->chunk().offset); + } + for (auto& allocation : allocations) { allocations_->push_back(std::move(allocation)); } - pending_chunks_.clear(); - pending_async_copies_.clear(); - pending_required_assignments_.clear(); + // Add a repack allocation block for the Allocation objects in alternate + // memory. 
+ CHECK_EQ(repack_allocation_blocks_.size(), 0); + for (const auto& allocation : *allocations_) { + if (allocation->memory_space() == MemorySpace::kAlternate) { + repack_allocation_blocks_.push_back(MakeRepackAllocationBlock( + allocation->start_time(), allocation->end_time(), + allocation->chunk().size, allocation->chunk().offset, + static_cast(repack_allocation_blocks_.size()), + allocation.get())); + RepackAllocationBlock* inserted = &repack_allocation_blocks_.back(); + for (RepackAllocationBlock& colocation : repack_allocation_blocks_) { + colocation.colocations.push_back(inserted); + if (&colocation != inserted) { + inserted->colocations.push_back(&colocation); + } + } + } + } + + ClearPendingChunks(); } -absl::optional +absl::optional AlternateMemoryBestFitHeap::RequiredMemoryAssignmentAt(const HloValue* buffer, int64 time) const { auto required_assignment_it = required_assignments_.find(buffer); @@ -1383,7 +1525,7 @@ AlternateMemoryBestFitHeap::RequiredMemoryAssignmentAt(const HloValue* buffer, return required_assignment_at_time; } -absl::optional +absl::optional AlternateMemoryBestFitHeap::AliasedRequiredAssignmentForUse( const AllocationValue::Use& use) const { absl::optional required_assignment; @@ -1409,26 +1551,26 @@ AlternateMemoryBestFitHeap::AliasedRequiredAssignmentForUse( void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation) { - absl::optional chunk; + AliasedOffset* offset = nullptr; if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { - chunk = aliased_allocation->chunk(); + offset = GetAliasedOffset(*aliased_allocation); } AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), - chunk); + offset); } void AlternateMemoryBestFitHeap::AddRequiredAssignment( const HloValue* value, const HloInstruction* instruction, MemorySpaceAssignment::MemorySpace memory_space, int64 time, - absl::optional chunk) { + AliasedOffset* offset) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { CHECK(memory_space == existing_required_assignment->memory_space) << "inst = " << instruction->ToString() << " at " << time; - CHECK((!chunk && !existing_required_assignment->chunk) || - chunk->offset == existing_required_assignment->chunk->offset); + CHECK((!offset && !existing_required_assignment->offset) || + offset == existing_required_assignment->offset); VLOG(3) << "Not adding required assignment because there is one already: " << value->ToShortString() << " at " << time << " at " << (memory_space == MemorySpace::kDefault ? "def" : "alt"); @@ -1436,7 +1578,7 @@ void AlternateMemoryBestFitHeap::AddRequiredAssignment( VLOG(3) << "Adding required assignment: " << value->ToShortString() << " at " << time << " at " << (memory_space == MemorySpace::kDefault ? 
"def" : "alt"); - RequiredMemoryAssignment required_assignment{memory_space, time, chunk}; + RequiredMemoryAssignment required_assignment{memory_space, time, offset}; required_assignments_[value].push_back(required_assignment); pending_required_assignments_.push_back({value, required_assignment}); } @@ -1444,13 +1586,13 @@ void AlternateMemoryBestFitHeap::AddRequiredAssignment( void AlternateMemoryBestFitHeap::AddRequiredAssignment( const HloInstruction* instruction, ShapeIndex index, - MemorySpace memory_space, absl::optional chunk) { + MemorySpace memory_space, AliasedOffset* offset) { const HloValue* value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); int64 instruction_time = hlo_live_range_.instruction_schedule().at(instruction); AddRequiredAssignment(value, instruction, memory_space, instruction_time, - chunk); + offset); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1539,7 +1681,38 @@ bool AlternateMemoryBestFitHeap::AreIntervalsReservedInAlternateMemory( return false; } -void AlternateMemoryBestFitHeap::UncommitPendingChunks() { +void AlternateMemoryBestFitHeap::ExportAllocationsForRepacking( + std::vector& allocations) { + for (RepackAllocationBlock& allocation_block : repack_allocation_blocks_) { + allocations.push_back(&allocation_block); + } +} + +void AlternateMemoryBestFitHeap::ImportRepackedAllocations() { + interval_tree_ = {}; + for (RepackAllocationBlock& allocation_block : repack_allocation_blocks_) { + MemorySpaceAssignment::Allocation* allocation = allocation_block.allocation; + VLOG(3) << "Moved " << allocation->ToString() << ", size " + << allocation->chunk().size << ", (" << allocation_block.start_time + << ", " << allocation_block.end_time << ") from " + << allocation_block.initial_offset << " to " + << allocation_block.offset; + allocation_block.allocation->mutable_chunk()->offset = + allocation_block.offset; + interval_tree_.Add(allocation_block.start_time, allocation_block.end_time, + {allocation_block.offset, allocation_block.size}); + allocation_block.initial_offset = allocation_block.offset; + allocation_block.offset = -1; + } +} + +void AlternateMemoryBestFitHeap::UncommitPendingChunks( + absl::Span allocation_values) { + // Clear the allocation sequence of the allocation values so that in case we + // retry allocation after uncommitting. + for (AllocationValue& allocation_value : allocation_values) { + allocation_value.allocation_sequence()->clear(); + } for (const auto& interval_and_chunk : pending_chunks_) { const BufferInterval& interval = interval_and_chunk.first; const Chunk& chunk = interval_and_chunk.second.chunk; @@ -1568,8 +1741,8 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks() { ? "def" : "alt") << " time = " << required_assignment.time << " off = " - << (required_assignment.chunk ? required_assignment.chunk->offset - : -1); + << (required_assignment.offset ? 
required_assignment.offset->offset + : -1); for (auto it = required_assignment_vector.begin(); it != required_assignment_vector.end(); ++it) { if (*it == value_and_required_assignment.second) { @@ -1578,9 +1751,56 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks() { } } } + ClearPendingChunks(); +} + +void AlternateMemoryBestFitHeap::FinalizeAllocations( + absl::Span allocation_values) { + absl::flat_hash_map> + colocation_map; + for (AllocationValue& allocation_value : allocation_values) { + for (auto& allocation : *allocation_value.allocation_sequence()) { + AppendAllocationInfoDebugString(allocation_value, *allocation, + allocation_info_str_); + allocations_->push_back(std::move(allocation)); + MemorySpaceAssignment::Allocation* inserted_allocation = + allocations_->back().get(); + if (inserted_allocation->memory_space() == MemorySpace::kAlternate) { + colocation_map[GetAliasedOffset(*inserted_allocation)].push_back( + inserted_allocation); + } + } + } + // The allocations that have the same AliasedOffset need to be colocated. + // Export these to repack_allocation_blocks_ so that we can repack them to + // reduce fragmentation. + for (auto& colocation : colocation_map) { + std::vector colocations; + for (MemorySpaceAssignment::Allocation* colocated_allocation : + colocation.second) { + repack_allocation_blocks_.push_back(MakeRepackAllocationBlock( + colocated_allocation->start_time(), colocated_allocation->end_time(), + colocated_allocation->chunk().size, + colocated_allocation->chunk().offset, + static_cast(repack_allocation_blocks_.size()), + colocated_allocation)); + colocations.push_back(&repack_allocation_blocks_.back()); + } + for (MemorySpaceAssignmentRepacker::AllocationBlock* repack_block : + colocations) { + repack_block->colocations = colocations; + } + } + ClearPendingChunks(); +} + +void AlternateMemoryBestFitHeap::ClearPendingChunks() { pending_chunks_.clear(); pending_async_copies_.clear(); pending_required_assignments_.clear(); + aliased_offset_map_.clear(); + aliased_offsets_.clear(); } void AlternateMemoryBestFitHeap::AddToPendingChunks( @@ -1593,7 +1813,7 @@ void AlternateMemoryBestFitHeap::AddToPendingChunks( CommitChunk(buffer_interval, chunk_candidate); } -bool AlternateMemoryBestFitHeap::AllocateSegment( +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::AllocateSegment( const AllocationRequest& request) { auto allocation_sequence = request.allocation_value->allocation_sequence(); // start_time == end_time is a special case where the value is consumed @@ -1604,7 +1824,7 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( GetLiveAllocationAt(*allocation_sequence, request.end_time); CHECK_NE(allocation, nullptr); allocation->AddUse(request.use->hlo_use); - return true; + return Result::kSuccess; } const HloPosition& defining_position = @@ -1656,24 +1876,37 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( const auto& prev_allocation = allocation_sequence->back(); CHECK(prev_allocation->memory_space() == required_assignment_at_start->memory_space); - CHECK_EQ(prev_allocation->chunk().offset, - required_assignment_at_start->chunk->offset); + CHECK_EQ(GetAliasedOffset(*prev_allocation), + required_assignment_at_start->offset); prev_allocation->Extend(request.start_time); } else { + absl::optional aliased_chunk = absl::nullopt; + if (required_assignment_at_start->memory_space == + MemorySpace::kAlternate) { + aliased_chunk = + Chunk{required_assignment_at_start->offset->offset, request.size}; + } allocation_sequence->push_back( 
absl::make_unique( defining_position, required_assignment_at_start->memory_space, - required_assignment_at_start->chunk, request.start_time, - request.start_time)); + aliased_chunk, request.start_time, request.start_time)); + if (required_assignment_at_start->memory_space == + MemorySpace::kAlternate) { + CreateOrAddToAliasedOffset(*allocation_sequence->back(), + required_assignment_at_start->offset); + } } } + Result allocation_result = Result::kSuccess; // First try keeping the allocation entirely in the alternate memory. if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && - request.allow_no_copy_alternate_mem_allocation && - AllocateInAlternateMemoryNoCopy(request)) { - return true; + request.allow_no_copy_alternate_mem_allocation) { + allocation_result = AllocateInAlternateMemoryNoCopy(request); + if (allocation_result == Result::kSuccess) { + return Result::kSuccess; + } } auto prev_allocation_it = allocation_sequence->rbegin(); @@ -1692,8 +1925,10 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( (*prev_allocation_it)->defining_position() == defining_position) { // If there was an allocation for this HloValue that was in the alternate // memory space, we also need to perform an eviction. - if (!Evict(request)) { - return false; + Result eviction_result = Evict(request); + if (eviction_result != Result::kSuccess) { + // A non-success eviction requires us to uncommit previous allocations. + return result_mark(Result::kFailRequiresUncommit, eviction_result); } prev_allocation_in_default_mem_it = allocation_sequence->rbegin(); } else if (prev_allocation_in_default_mem_it == allocation_sequence->rend()) { @@ -1714,38 +1949,36 @@ bool AlternateMemoryBestFitHeap::AllocateSegment( << "Not trying to prefetch because use requires buffer in default mem."; (*prev_allocation_in_default_mem_it)->Extend(request.end_time); (*prev_allocation_in_default_mem_it)->AddUse(request.use->hlo_use); - return true; + return Result::kSuccess; } // Finally, try to prefetch the buffer into alternate memory. - if (Prefetch(request, **prev_allocation_in_default_mem_it)) { - return true; - } - if (!final_retry_ && prefetch_failed_due_to_async_copy_) { - // If prefetching failed due to asynchronous copy and we're not in our final - // try, return false (failure) so that we can retry this interval with - // larger limits. - return false; + Result prefetch_result = + Prefetch(request, **prev_allocation_in_default_mem_it); + if (prefetch_result == Result::kSuccess) { + return Result::kSuccess; } + result_mark(prefetch_result, allocation_result); // If the end assignment was required to be in alternate memory but that // wasn't possible, then this allocation is invalid. if (required_memory_space_at_end == MemorySpace::kAlternate) { - return false; + return result_mark(Result::kFailRequiresUncommit, allocation_result); } // If a copy wasn't inserted, then add this use to the latest allocation in // default memory. 
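Several hunks in this file replace raw chunk offsets with AliasedOffset* handles so that allocations that must share an alternate-memory offset can be grouped and later repacked together. A minimal sketch of that bookkeeping idea, using generic container types rather than the patch's classes:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <list>
#include <set>
#include <unordered_map>

struct Allocation { int64_t chunk_offset; };

// One record per distinct alternate-memory offset; every allocation that must
// alias that offset points at the same record.
struct AliasedOffset {
  int64_t offset;
  std::set<const Allocation*> allocations;
};

class OffsetTracker {
 public:
  // Register `alloc` under `aliased` if given, otherwise start a new record.
  AliasedOffset* CreateOrAddToAliasedOffset(const Allocation& alloc,
                                            AliasedOffset* aliased) {
    if (aliased == nullptr) {
      offsets_.push_back({alloc.chunk_offset, {}});
      aliased = &offsets_.back();
    }
    assert(alloc.chunk_offset == aliased->offset);
    aliased->allocations.insert(&alloc);
    map_[&alloc] = aliased;
    return aliased;
  }
  AliasedOffset* GetAliasedOffset(const Allocation& alloc) const {
    return map_.at(&alloc);
  }

 private:
  std::list<AliasedOffset> offsets_;  // std::list keeps pointers stable.
  std::unordered_map<const Allocation*, AliasedOffset*> map_;
};

int main() {
  OffsetTracker tracker;
  Allocation a{128}, b{128};
  AliasedOffset* off = tracker.CreateOrAddToAliasedOffset(a, nullptr);
  tracker.CreateOrAddToAliasedOffset(b, off);  // b must share a's offset.
  std::cout << (tracker.GetAliasedOffset(a) == tracker.GetAliasedOffset(b))
            << "\n";  // Prints 1.
  return 0;
}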
(*prev_allocation_in_default_mem_it)->Extend(request.end_time); (*prev_allocation_in_default_mem_it)->AddUse(request.use->hlo_use); - return true; + return allocation_result; } void AlternateMemoryBestFitHeap::AddAsyncCopy( const MemorySpaceAssignment::Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, - MemorySpaceAssignment::AllocationSequence* allocations) { + MemorySpaceAssignment::AllocationSequence* allocations, + AliasedOffset* aliased_offset, bool is_cross_program_prefetch) { VLOG(3) << "Copy to " << (memory_space == MemorySpaceAssignment::MemorySpace::kDefault ? "default" @@ -1757,7 +1990,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( allocations->push_back( absl::make_unique( prev_allocation, memory_space, chunk, start_time, end_time, - copy_done_schedule_before_time)); + copy_done_schedule_before_time, is_cross_program_prefetch)); // Register the additional async copy with the interval tree to keep track of // the limit at any given time. @@ -1767,6 +2000,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( prefetch_interval_tree_.Add(start_time, copy_done_schedule_before_time, kDummyChunk); async_copy_ordering_.AddCopy(pending_async_copies_.back()); + CreateOrAddToAliasedOffset(*allocations->back(), aliased_offset); } else { eviction_interval_tree_.Add(start_time, copy_done_schedule_before_time, kDummyChunk); @@ -1805,7 +2039,8 @@ AlternateMemoryBestFitHeap::ViolatesAsyncCopyOrdering(int64 start_time, return async_copy_ordering_.ViolatesOrdering(start_time, end_time); } -bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( +AlternateMemoryBestFitHeap::Result +AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( const AllocationRequest& request) { MemorySpaceAssignment::Allocation* prev_allocation = nullptr; bool can_eliminate_copy = false; @@ -1824,7 +2059,7 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( } if (!can_eliminate_copy) { - return false; + return Result::kFailPrevAllocationNotInAlternateMem; } const HloPosition& defining_position = @@ -1832,7 +2067,7 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( if (!options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( defining_position.shape(), request.start_time + 1, request.end_time)) { - return false; + return Result::kFailLiveRangeTooLong; } BufferInterval alternate_mem_interval; @@ -1842,9 +2077,9 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( alternate_mem_interval.start = request.start_time; // Prefer the offset that was previously used for the previous allocation. - absl::optional preferred_offset; + AliasedOffset* preferred_offset = nullptr; if (prev_allocation != nullptr) { - preferred_offset = prev_allocation->chunk().offset; + preferred_offset = GetAliasedOffset(*prev_allocation); // If there is a previous allocation, set the start time one after the end // of the previous allocation's end. alternate_mem_interval.start = prev_allocation->end_time() + 1; @@ -1854,13 +2089,13 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( // Sanity check that if there is a preferred offset provided in the request, // it matches with the previous allocation. 
CHECK(!preferred_offset || request.preferred_offset == preferred_offset) - << "preferred_offset = " << *preferred_offset - << ", request.preferred_offset = " << *request.preferred_offset; + << "preferred_offset = " << preferred_offset->offset + << ", request.preferred_offset = " << request.preferred_offset->offset; preferred_offset = request.preferred_offset; } VLOG(3) << "We can eliminate copy to alternate memory. Preferred offset = " - << (preferred_offset ? *preferred_offset : -1); + << (preferred_offset ? preferred_offset->offset : -1); // In case there are additional uses after this use, we rely on the last use // time to try to reserve a chunk in the heap simulator. This is to prevent // the following scenario: @@ -1908,15 +2143,19 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( absl::make_unique( defining_position, MemorySpace::kAlternate, chunk_candidate->chunk, request.start_time, request.end_time)); + CreateOrAddToAliasedOffset( + *request.allocation_value->allocation_sequence()->back(), + preferred_offset); } request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); - return true; + return Result::kSuccess; } - return false; + return Result::kFailOutOfMemory; } -bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Evict( + const AllocationRequest& request) { CHECK_GT(request.allocation_value->allocation_sequence()->size(), 0); MemorySpaceAssignment::Allocation* prev_allocation = request.allocation_value->allocation_sequence()->back().get(); @@ -1970,7 +2209,8 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, /*chunk=*/absl::nullopt, eviction_start_time, prev_allocation->end_time(), eviction_end_time, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + /*aliased_offset=*/nullptr); } else { if (eviction_violates_outstanding_copies) { VLOG(3) << "This violates the maximum async copies."; @@ -1988,7 +2228,8 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { VLOG(3) << "Eviction successful."; AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, /*chunk=*/absl::nullopt, time, time + 1, time + 1, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + /*aliased_offset=*/nullptr); eviction_scheduled = true; break; } @@ -2005,22 +2246,25 @@ bool AlternateMemoryBestFitHeap::Evict(const AllocationRequest& request) { << " and " << hlo_live_range_.flattened_instruction_sequence() .instructions()[eviction_end_time]; - return false; + return Result::kFailOutOfAsyncCopies; } } - return true; + return Result::kSuccess; } int64 AlternateMemoryBestFitHeap::FindPrefetchEndTime( const AllocationRequest& request, int64 earliest_prefetch_time) const { int64 prefetch_end_time = request.latest_prefetch_time; + const HloUse& use = request.use->hlo_use; + const Shape& shape = ShapeUtil::GetSubshape( + use.instruction->operand(use.operand_number)->shape(), use.operand_index); for (int retry_number = 0; retry_number < options_.prefetch_copy_done_reorder_max_retries; ++retry_number) { int64 latest_prefetch_time = options_.prefetch_interval_picker->LatestPrefetchStartTime( - request.use->hlo_use, earliest_prefetch_time, prefetch_end_time); + shape, earliest_prefetch_time, prefetch_end_time, &use); VLOG(4) << "Latest prefetch start
time = " << latest_prefetch_time << ", earliest prefetch start time = " << earliest_prefetch_time << ", prefetch end time = " << prefetch_end_time; @@ -2058,7 +2304,7 @@ int64 AlternateMemoryBestFitHeap::FindPrefetchEndTime( return prefetch_end_time; } -bool AlternateMemoryBestFitHeap::Prefetch( +AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Prefetch( const AllocationRequest& request, const MemorySpaceAssignment::Allocation& prev_allocation_in_default_mem) { // Try partially placing the buffer in the alternate space. The time that is @@ -2092,15 +2338,12 @@ bool AlternateMemoryBestFitHeap::Prefetch( BufferInterval alternate_mem_interval; alternate_mem_interval.buffer = request.allocation_value->value(); alternate_mem_interval.size = request.size; - // If any of the prefetch intervals couldn't be used due to number of - // outstanding async copy limit or async copy ordering, set - // prefetch_failed_due_to_async_copy_. - prefetch_failed_due_to_async_copy_ = false; // While uses might be allowed to have additional outstanding prefetches. int64 extra_async_copy_limit = request.use->hlo_use.instruction->opcode() == HloOpcode::kWhile ? options_.while_use_extra_outstanding_prefetch_limit : 0; + Result result = Result::kSuccess; while (!options_.prefetch_interval_picker->Done()) { alternate_mem_interval.start = options_.prefetch_interval_picker->Next(); CHECK_LT(alternate_mem_interval.start, prefetch_end_time); @@ -2111,14 +2354,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( if (ViolatesAsyncCopyOrdering(alternate_mem_interval.start, prefetch_end_time)) { VLOG(4) << "This would violate asynchronous copy ordering."; - prefetch_failed_due_to_async_copy_ = true; + result_mark(Result::kFailViolatesAsyncCopyOrdering, result); continue; } if (ViolatesMaximumOutstandingAsyncCopies( alternate_mem_interval.start, prefetch_end_time, /*is_prefetch=*/true, extra_async_copy_limit)) { VLOG(4) << "This would violate the outstanding async copy limit."; - prefetch_failed_due_to_async_copy_ = true; + result_mark(Result::kFailOutOfAsyncCopies, result); continue; } @@ -2138,20 +2381,27 @@ bool AlternateMemoryBestFitHeap::Prefetch( AddAsyncCopy(prev_allocation_in_default_mem, MemorySpace::kAlternate, chunk_candidate->chunk, alternate_mem_interval.start, request.end_time, prefetch_end_time, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + request.preferred_offset); request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); - prefetch_failed_due_to_async_copy_ = false; - return true; + return Result::kSuccess; } + result_mark(Result::kFailOutOfMemory, result); + } + // If we didn't consider any prefetch intervals, then the live range was too + // short. + if (result == Result::kSuccess) { + return Result::kFailLiveRangeTooShort; + } else { + return result; } - return false; } absl::optional AlternateMemoryBestFitHeap::FindBestChunkCandidate( - const AllocationRequest& request, absl::optional preferred_offset, + const AllocationRequest& request, const AliasedOffset* preferred_offset, BufferInterval* alternate_mem_interval) const { int64 end_time = request.end_time; if (!preferred_offset) { @@ -2197,8 +2447,8 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( // only. 
alternate_mem_interval->end = end_time; ChunkCandidate chunk_candidate = - FindChunkCandidate(*alternate_mem_interval, *preferred_offset); - if (chunk_candidate.chunk.offset == *preferred_offset) { + FindChunkCandidate(*alternate_mem_interval, preferred_offset->offset); + if (chunk_candidate.chunk.offset == preferred_offset->offset) { return chunk_candidate; } return absl::nullopt; @@ -2252,8 +2502,8 @@ MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( return x_memory_boundedness > y_memory_boundedness; } // Tie-break if the memory boundedness is the same. - return GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare()( - x, y); + return GlobalDecreasingSizeBestFitHeap< + HloValue>::GetSpatialBufferIntervalCompare()(x, y); }; } @@ -2295,6 +2545,9 @@ bool IsCrossProgramPrefetchCandidate( return value.instruction()->parent() == value.instruction()->GetModule()->entry_computation() && value.instruction()->opcode() == HloOpcode::kParameter && + (!value.shape().has_layout() || + value.shape().layout().memory_space() != + options.alternate_memory_space) && value.index().size() == 1 && value.shape().IsArray() && !value.uses().empty() && options.size_fn(value) <= options.max_size_in_bytes && @@ -2321,7 +2574,9 @@ FindCrossProgramPrefetchCandidate( const HloAliasAnalysis& alias_analysis, const HloLiveRange& hlo_live_range, const MemorySpaceAssignment::Options& options) { std::vector candidates; - for (HloValue* value : alias_analysis.dataflow_analysis().values()) { + for (const HloBuffer& buffer : alias_analysis.buffers()) { + CHECK_GE(buffer.values().size(), 1); + const HloValue* value = buffer.values().at(0); if (IsCrossProgramPrefetchCandidate(*value, options)) { MemorySpaceAssignment::BufferInterval interval; interval.buffer = value; @@ -2329,6 +2584,7 @@ FindCrossProgramPrefetchCandidate( interval.start = 0; interval.end = hlo_live_range.schedule_end_time(); interval.need_allocation = true; + interval.colocations = {++buffer.values().begin(), buffer.values().end()}; candidates.emplace_back(interval); } } @@ -2541,15 +2797,21 @@ HloInstruction* MemorySpaceAssignment::Allocation::AddGetTupleElements() { } std::string MemorySpaceAssignment::Allocation::ToString() const { - return absl::StrCat("Allocation in ", - memory_space_ == MemorySpace::kDefault ? "def" : "alt", - " defined at ", defining_position_.ToString()); + std::string memory_space_str = "def"; + if (memory_space_ == MemorySpace::kAlternate) { + memory_space_str = absl::StrCat("alt (off: ", chunk_->offset, ")"); + } + return absl::StrCat("Allocation in ", memory_space_str, " defined at ", + defining_position_.ToString()); } std::string MemorySpaceAssignment::CopyAllocation::ToString() const { - return absl::StrCat("Copy Allocation in ", - memory_space_ == MemorySpace::kDefault ? 
"def" : "alt", - " from ", prev_allocation_.ToString()); + std::string memory_space_str = "def"; + if (memory_space_ == MemorySpace::kAlternate) { + memory_space_str = absl::StrCat("alt (off: ", chunk_->offset, ")"); + } + return absl::StrCat("Copy Allocation in ", memory_space_str, " from ", + prev_allocation_.ToString()); } Status MemorySpaceAssignment::CopyAllocation::Process( @@ -2558,9 +2820,9 @@ Status MemorySpaceAssignment::CopyAllocation::Process( Shape shape = defining_position().shape(); HloInstruction* producing_instruction = AddGetTupleElements(); HloComputation* computation = producing_instruction->parent(); - copy_start_ = computation->AddInstruction(HloInstruction::CreateUnary( + copy_start_ = computation->AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({shape, shape, ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, producing_instruction)); + producing_instruction, is_cross_program_prefetch_)); copy_done_ = computation->AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCopyDone, copy_start_)); VLOG(4) << "Created " << copy_start_->name() diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 87f7dd2ddae..409a44d319d 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" namespace xla { @@ -105,7 +106,7 @@ class MemorySpaceAssignmentCostAnalysis { // BufferInterval. The larger this number, the higher priority it will be // placed in the alternate memory. float GetMemoryBoundedness( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, Cache* cache = nullptr) const; // Returns the elapsed time in seconds due to compute only. @@ -199,8 +200,15 @@ class PrefetchIntervalPicker { int64 latest_end_time) const = 0; // Returns the latest time that a prefetch can start. - virtual int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const = 0; + virtual int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const = 0; + + // Returns the preferred time that a prefetch can start. + virtual int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const = 0; // Returns the latest time that a prefetch can end that is less than or equal // to proposed_prefetch_end_time. @@ -234,7 +242,8 @@ class PrefetchIntervalPicker { // of placing the BufferInterval in the alternate memory. The larger value, // the more beneficial. 
virtual absl::optional BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const { return absl::nullopt; } @@ -267,8 +276,14 @@ class InstructionCountPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; - int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const override; + int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const override; + + int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const override; void Begin(const HloUse& use, int64 start_time, int64 end_time) override; @@ -306,11 +321,18 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; - int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const override; int64 LatestPrefetchEndTime(int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const override; + int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const override; + + int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const override; + void Begin(const HloUse& use, int64 start_time, int64 end_time) override; int64 Next() override; @@ -323,7 +345,7 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 end_time) const override; absl::optional BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const override; private: @@ -354,7 +376,7 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 end_logical_time_; int64 earliest_prefetch_time_; int64 latest_prefetch_time_; - bool using_increasing_prefetch_time_iterator_; + bool using_increasing_prefetch_time_iterator_ = true; int64 increasing_prefetch_time_iterator_; int64 decreasing_prefetch_time_iterator_; }; @@ -369,9 +391,10 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { class MemorySpaceAssignment { public: using Chunk = HeapSimulator::Chunk; - using BufferInterval = GlobalDecreasingSizeBestFitHeap::BufferInterval; + using BufferInterval = + GlobalDecreasingSizeBestFitHeap::BufferInterval; using BufferIntervalCompare = - GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare; + GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare; using IsAllowedInAlternateMemoryFunction = std::function; @@ -379,6 +402,9 @@ class MemorySpaceAssignment { // space and a fast and small alternate memory space. enum class MemorySpace { kDefault, kAlternate }; + // Forward declaration for Allocation. + class Allocation; + // The different options to be passed to the Run() API. struct Options { // Backend-specific integer value that describes the alternate memory. @@ -424,6 +450,15 @@ class MemorySpaceAssignment { // copies or asynchronous copy ordering. 
int64 max_retries = 1; + // The maximum number of repacks that we are willing to perform in case we + // can't allocate a buffer due to running out of memory. If this value is + // greater than 0, repacker must be non-nullptr. + int64 max_repacks = 0; + + // The repacking algorithm to reduce fragmentation. Must be non-null if + // max_repacks is greater than 0. + MemorySpaceAssignmentRepacker* repacker = nullptr; + // If true, tries allocating buffers across (e.g., before and inside a while // loop body) sequential calls (kWhile, kCall, and kConditional). bool allocate_across_sequential_calls = false; @@ -511,6 +546,7 @@ class MemorySpaceAssignment { const std::vector& uses() const { return uses_; } MemorySpace memory_space() const { return memory_space_; } Chunk chunk() const { return *chunk_; } + Chunk* mutable_chunk() { return &*chunk_; } void set_start_time(int64 start_time) { start_time_ = start_time; } int64 start_time() const { return start_time_; } int64 end_time() const { return end_time_; } @@ -545,12 +581,14 @@ class MemorySpaceAssignment { public: CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, - int64 end_time, int64 copy_done_schedule_before_time) + int64 end_time, int64 copy_done_schedule_before_time, + bool is_cross_program_prefetch = false) : Allocation(/*defining_position=*/{nullptr, {}}, memory_space, chunk, start_time, end_time), prev_allocation_(prev_allocation), copy_start_schedule_after_(start_time), - copy_done_schedule_before_(copy_done_schedule_before_time) {} + copy_done_schedule_before_(copy_done_schedule_before_time), + is_cross_program_prefetch_(is_cross_program_prefetch) {} bool is_copy_allocation() const override { return true; } @@ -590,6 +628,10 @@ class MemorySpaceAssignment { copy_start_schedule_after_ = copy_start_schedule_after; } + bool is_cross_program_prefetch() const { + return is_cross_program_prefetch_; + } + bool operator==(const CopyAllocation& other) const; std::string ToString() const override; @@ -601,6 +643,7 @@ class MemorySpaceAssignment { // is before copy_done_schedule_before_. int64 copy_start_schedule_after_; int64 copy_done_schedule_before_; + bool is_cross_program_prefetch_; HloInstruction* copy_start_; HloInstruction* copy_done_; }; @@ -687,13 +730,15 @@ class MemorySpaceAssignment { std::vector aliases; }; - AllocationValue(const HloValue* value, const HloPosition& position) - : value_(value), defining_position_(position) {} + AllocationValue(const HloValue* value, const HloPosition& position, + int64 size) + : value_(value), defining_position_(position), size_(size) {} const HloPosition& defining_position() const { return defining_position_; } const HloInstruction* defining_instruction() const { return defining_position().instruction; } + int64 size() const { return size_; } const std::vector& uses() const { return uses_; } std::vector& uses() { return uses_; } const HloValue* value() const { return value_; } @@ -712,6 +757,7 @@ class MemorySpaceAssignment { private: const HloValue* value_; HloPosition defining_position_; + int64 size_; std::vector uses_; AllocationSequence allocation_sequence_; }; @@ -825,29 +871,6 @@ class MemorySpaceAssignment { absl::flat_hash_map> schedule_before_; }; -// This struct contains mandatory memory assignments at a given time. E.g., an -// input's required memory assignment time would correspond to the definition -// time of the parameter instruction, and an output's time would correspond to -// the time of last use. 
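To make the contract between max_repacks and repacker concrete, a minimal configuration sketch modeled on the Repack test added later in this patch; the 128-byte size, the alignment, and the choice of the best-fit repacker are illustrative only:

    // The repacker instance must outlive the Options that reference it.
    MemorySpaceAssignmentBestFitRepacker repacker(/*max_size=*/128,
                                                  /*alignment=*/8);
    MemorySpaceAssignment::Options options;
    options.max_size_in_bytes = 128;
    options.alignment_in_bytes = 8;
    options.max_repacks = 1;       // > 0 turns repacking on
    options.repacker = &repacker;  // required whenever max_repacks > 0
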
-struct RequiredMemoryAssignment { - MemorySpaceAssignment::MemorySpace memory_space; - int64 time; - absl::optional chunk; - - bool equals_ignoring_time(const RequiredMemoryAssignment& other) const { - return memory_space == other.memory_space && chunk == other.chunk; - } - - bool operator==(const RequiredMemoryAssignment& other) const { - return memory_space == other.memory_space && time == other.time && - chunk == other.chunk; - } - - bool operator!=(const RequiredMemoryAssignment& other) const { - return !(*this == other); - } -}; - // A struct representing an asynchronous copy with its logical start and end // time and its destination memory space. struct AsynchronousCopy { @@ -896,7 +919,8 @@ class AsynchronousCopyOrdering { // This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of // maximum size. -class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { +class AlternateMemoryBestFitHeap + : public GlobalDecreasingSizeBestFitHeap { public: using MemorySpace = MemorySpaceAssignment::MemorySpace; using AllocationValue = MemorySpaceAssignment::AllocationValue; @@ -923,9 +947,23 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { void AllocateCrossProgramPrefetchBuffer( HloModule* module, absl::optional prefetch_candidate); - HeapSimulator::Result Finish() override; + HeapSimulator::Result Finish() override; private: + // We inherit AllocationBlock struct to attach the Allocation information to + // make importing repacked offsets easier. + struct RepackAllocationBlock + : MemorySpaceAssignmentRepacker::AllocationBlock { + MemorySpaceAssignment::Allocation* allocation; + }; + + // A data structure we use to associate Allocation objects that are aliased + // and must get the same offset. + struct AliasedOffset { + int64 offset; + absl::flat_hash_set allocations; + }; + // An allocation request for a use segment. A use segment is the time segment // between the definition and the first use, and the time segment between the // uses of a buffer. For example, the time between the definition and Use1, is @@ -953,11 +991,101 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { int64 size; bool allow_no_copy_alternate_mem_allocation; absl::optional earliest_prefetch_time; - absl::optional preferred_offset; + AliasedOffset* preferred_offset; const MemorySpaceAssignment::AllocationValue::Use* use; MemorySpaceAssignment::AllocationValue* allocation_value; }; + // This struct contains mandatory memory assignments at a given time. E.g., an + // input's required memory assignment time would correspond to the definition + // time of the parameter instruction, and an output's time would correspond to + // the time of last use. + struct RequiredMemoryAssignment { + MemorySpaceAssignment::MemorySpace memory_space; + int64 time; + AliasedOffset* offset; + + bool equals_ignoring_time(const RequiredMemoryAssignment& other) const { + return memory_space == other.memory_space && offset == other.offset; + } + + bool operator==(const RequiredMemoryAssignment& other) const { + return memory_space == other.memory_space && time == other.time && + offset == other.offset; + } + + bool operator!=(const RequiredMemoryAssignment& other) const { + return !(*this == other); + } + }; + + // Result of an allocation, prefetch, eviction etc. request. The result is + // either kSuccess or a bitwise OR of one or more failures. The values are + // unique powers of two. 
To check if a result contains a particular failure, + // use the result_is method. To add a new failure to a result, use the + // result_mark method. + enum class Result { + // Successful allocation. + kSuccess = 0, + // Allocation failed because we ran out of alternate memory. + kFailOutOfMemory = 1, + // A no-copy allocation couldn't be performed because the previous + // allocation wasn't in the alternate memory space. + kFailPrevAllocationNotInAlternateMem = 2, + // A no-copy allocation couldn't be performed because the live range was too + // long. + kFailLiveRangeTooLong = 4, + // A prefetching couldn't be performed because the live range was too short. + kFailLiveRangeTooShort = 8, + // Ran out of outstanding asynchronous copy limit either during prefetching + // or eviction. + kFailOutOfAsyncCopies = 16, + // A prefetching couldn't be performed because the asynchronous copy + // ordering was violated. + kFailViolatesAsyncCopyOrdering = 32, + // An allocation failure happened that requires uncommitting all the pending + // allocations. Usually this is due to a situation requiring an eviction but + // the eviction couldn't be performed. + kFailRequiresUncommit = 64 + }; + + // Return true if the result belongs to a failure. + static bool result_is(Result result, Result failure) { + return static_cast(result) & static_cast(failure); + } + + // Mark (bitwise OR) a failure to the result. + static Result result_mark(Result failure, Result& result) { + result = static_cast(static_cast(result) | + static_cast(failure)); + return result; + } + + // Return true if the result is a failure that requires us to uncommit pending + // chunks. + static bool result_requires_uncommit(Result result) { + return result_is(result, Result::kFailRequiresUncommit); + } + + // Return true if the result is a failure either due to running out of + // outstanding asynchronous copies or due to violating asynchronous copy + // ordering. + static bool result_failed_because_of_async_copy(Result result) { + return result_is(result, Result::kFailOutOfAsyncCopies) || + result_is(result, Result::kFailViolatesAsyncCopyOrdering); + } + + // Returns the AliasedOffset object associated with the allocation. + AliasedOffset* GetAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation); + + // If aliased_offset is non-null, this method adds the allocation to + // aliased_offset. Otherwise, it creates a new AliasedOffset object and adds + // the allocation to this new AliasedOffset. + void CreateOrAddToAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation, + AliasedOffset* aliased_offset); + // Given an allocation sequence, returns the live allocation at time with a // preference towards allocations in alternate memory. Returns nullptr if no // allocation is alive at that time. @@ -968,17 +1096,24 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsUseAllowedInAlternateMemory(const AllocationValue& value, const HloUse& use) const; - // Given an HloValue, creates AllocationValue objects and corresponding + // Given a BufferInterval, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. - void CreateAllocationValues(const HloValue* value, - std::vector* allocation_values); + void CreateAllocationValues( + const BufferInterval& buffer_interval, + std::vector& allocation_values) const; - // Finds allocations for colocated intervals. 
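Since the whole Result machinery is new, a self-contained illustration of the power-of-two, bitwise-OR convention may help. This is plain C++ detached from the XLA headers, with only a subset of the failure values:

    #include <cstdint>
    #include <iostream>

    // Failures are distinct powers of two so several can be OR-ed into one value.
    enum class Result : std::uint64_t {
      kSuccess = 0,
      kFailOutOfMemory = 1,
      kFailLiveRangeTooShort = 8,
      kFailRequiresUncommit = 64,
    };

    bool result_is(Result result, Result failure) {
      return static_cast<std::uint64_t>(result) &
             static_cast<std::uint64_t>(failure);
    }

    Result result_mark(Result failure, Result& result) {
      result = static_cast<Result>(static_cast<std::uint64_t>(result) |
                                   static_cast<std::uint64_t>(failure));
      return result;
    }

    int main() {
      Result result = Result::kSuccess;
      result_mark(Result::kFailOutOfMemory, result);
      result_mark(Result::kFailLiveRangeTooShort, result);
      // Both failures are now recorded in the same value.
      std::cout << result_is(result, Result::kFailOutOfMemory) << "\n";       // 1
      std::cout << result_is(result, Result::kFailRequiresUncommit) << "\n";  // 0
      return 0;
    }
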
Colocated intervals consist of - // one or more BufferIntervals, each with a different HloValue. All of the - // intervals within colocated intervals have a must-alias relationship with - // each other. Returns true if allocation succeeded. - bool AllocateColocatedIntervals( - const std::vector& colocated_intervals); + // Given colocated intervals, populates allocation_values with the + // corresponding AllocationValue objects. + void CreateAllocationValuesFromColocatedIntervals( + absl::Span colocated_intervals, + std::vector& allocation_values); + + // Finds allocations for allocation values generated from colocated intervals. + // All of the allocation values have a must-alias relationship with each + // other. Returns either kSuccess if all of the sites could be placed in the + // alternate memory or a bitwise OR of failure reasons why they couldn't + Result AllocateAllocationValues( + absl::Span allocation_values); // Go through all the uses in the AllocationValues and find the aliasing // positions. @@ -996,24 +1131,26 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // if there is enough space and if the prefetch interval picker allows. // // If an eviction (2) was requested and was unsuccessful, this method returns - // false. This means we could not find a suitable allocation, so all previous - // allocations for this buffer must be removed and allocated in the default - // memory. Otherwise, this method returns true. - bool AllocateSegment(const AllocationRequest& request); + // Result::kFailRequiresUncommit. This means we could not find a suitable + // allocation, so all previous allocations for this buffer must be removed and + // allocated in the default memory. Otherwise, this method may return + // Result::kSuccess if the buffer could be placed in alternate memory or some + // other Result with an OR of reasons why the buffer couldn't be placed in + // alternate memory. + Result AllocateSegment(const AllocationRequest& request); - // Try allocating in alternate memory without any copies. Returns true if - // successful. - bool AllocateInAlternateMemoryNoCopy(const AllocationRequest& request); + // Try allocating in alternate memory without any copies. + Result AllocateInAlternateMemoryNoCopy(const AllocationRequest& request); - // Try evicting to default memory space. Returns true if successful. - bool Evict(const AllocationRequest& request); + // Try evicting to default memory space. + Result Evict(const AllocationRequest& request); // Returns the time a copy done of a prefetch should be scheduled. int64 FindPrefetchEndTime(const AllocationRequest& request, int64 earliest_prefetch_time) const; - // Try prefetching to alternate memory space. Returns true if successful. - bool Prefetch( + // Try prefetching to alternate memory space. + Result Prefetch( const AllocationRequest& request, const MemorySpaceAssignment::Allocation& prev_allocation_in_default_mem); @@ -1021,7 +1158,7 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // availability if no preferred offset is given, or at the preferred_offset if // it is given. absl::optional FindBestChunkCandidate( - const AllocationRequest& request, absl::optional preferred_offset, + const AllocationRequest& request, const AliasedOffset* preferred_offset, BufferInterval* alternate_mem_interval) const; // Returns the required assignment at a particular time, if available. 
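The header only documents how callers should react to these Result values; the following is a plausible sketch of that reaction built from the comments above, assuming a std::vector<AllocationValue> named allocation_values (UncommitPendingChunks and FinalizeAllocations are declared a bit further below):

    Result result = AllocateAllocationValues(absl::MakeSpan(allocation_values));
    if (result_requires_uncommit(result)) {
      // Every tentative decision for these values is invalid; roll them back
      // so the buffer can fall back to default memory.
      UncommitPendingChunks(absl::MakeSpan(allocation_values));
    } else {
      FinalizeAllocations(absl::MakeSpan(allocation_values));
    }
    // The OR-ed failure bits remain inspectable, e.g. to decide whether a
    // repack attempt might help on a retry:
    if (result_is(result, Result::kFailOutOfMemory) &&
        !result_failed_because_of_async_copy(result)) {
      // ... candidate for repacking ...
    }
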
@@ -1043,10 +1180,10 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { void AddRequiredAssignment(const HloValue* value, const HloInstruction* instruction, MemorySpace memory_space, int64 time, - absl::optional chunk = absl::nullopt); + AliasedOffset* offset = nullptr); void AddRequiredAssignment(const HloInstruction* instruction, ShapeIndex index, MemorySpace memory_space, - absl::optional chunk = absl::nullopt); + AliasedOffset* offset = nullptr); // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); @@ -1081,12 +1218,24 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { absl::optional ViolatesAsyncCopyOrdering( int64 start_time, int64 end_time) const; + // Exports the allocations for repacking and puts them into the vector in the + // parameter. + void ExportAllocationsForRepacking( + std::vector& + allocations); + + // Imports repacked allocations and updates the internal data structures + // consistent with the new packing. + void ImportRepackedAllocations(); + // Adds an asynchronous copy to the allocations. void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, - MemorySpaceAssignment::AllocationSequence* allocations); + MemorySpaceAssignment::AllocationSequence* allocations, + AliasedOffset* aliased_offset, + bool is_cross_program_prefetch = false); // This method is used for committing the chunk candidate but adding it to // pending_chunks_ so that we can "uncommit" them in case we need to roll back @@ -1095,17 +1244,24 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const ChunkCandidate& chunk_candidate); // If we need to remove the allocations for this allocation sequence, this // removes pending chunks and asynchronous copies in the respective pending - // buffers from the interval trees. - void UncommitPendingChunks(); + // buffers from the interval trees. If an allocation request returns + // kFailRequiresUncommit, this method must be called. + void UncommitPendingChunks(absl::Span allocation_values); + + // Finalizes the allocations where they can no longer be uncommitted. + void FinalizeAllocations(absl::Span allocation_values); + + // Clears all pending chunks and asynchronous copies. + void ClearPendingChunks(); // Append buffer and allocation infos for debugging and dump it into a file, // if enabled. void AppendBufferInfoDebugString(const BufferInterval& interval, std::string* debug_str) const; void AppendAllocationInfoDebugString( - const BufferInterval& interval, + const AllocationValue& value, const MemorySpaceAssignment::Allocation& allocation, - std::string* debug_str) const; + std::string& debug_str) const; void DumpDebugStringsIfEnabled() const; // Returns the available heap size in the alternate memory. @@ -1113,6 +1269,22 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { return options_.max_size_in_bytes - reserved_in_bytes_; } + // Creates and returns a RepackAllocationBlock. 
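ExportAllocationsForRepacking, ImportRepackedAllocations, num_repacks_ and options_.max_repacks hint at a retry path whose body is not visible in this header; a hedged reconstruction of how those pieces likely fit together (not the actual implementation of Finish()):

    // Presumably guarded by num_repacks_ < options_.max_repacks and only
    // attempted after an out-of-memory style failure.
    std::vector<MemorySpaceAssignmentRepacker::AllocationBlock*> blocks;
    ExportAllocationsForRepacking(blocks);
    StatusOr<bool> repacked = options_.repacker->Repack(absl::MakeSpan(blocks));
    if (repacked.ok() && *repacked) {
      // The repacker wrote new offsets into the exported blocks; fold them
      // back into the interval trees and Allocation objects.
      ImportRepackedAllocations();
      ++num_repacks_;
      // ... then retry the failed allocation values ...
    }
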
+ static RepackAllocationBlock MakeRepackAllocationBlock( + int64 start_time, int64 end_time, int64 size, int64 initial_offset, + int64 id, MemorySpaceAssignment::Allocation* allocation) { + RepackAllocationBlock allocation_block; + allocation_block.start_time = start_time; + allocation_block.end_time = end_time; + allocation_block.size = size; + allocation_block.offset = -1; + allocation_block.initial_offset = initial_offset; + allocation_block.id = id; + allocation_block.colocations = {}; + allocation_block.allocation = allocation; + return allocation_block; + } + MemorySpaceAssignment::AllocationSequence* allocations_; const MemorySpaceAssignment::Options& options_; const HloAliasAnalysis& alias_analysis_; @@ -1122,19 +1294,26 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { BufferIntervalTree prefetch_interval_tree_; BufferIntervalTree eviction_interval_tree_; AsynchronousCopyOrdering async_copy_ordering_; + // A list of RepackAllocationBlock objects that mirrors allocation sequences, + // used for repacking. We use a list here because we need pointer stability + // for aliased allocations. + std::list repack_allocation_blocks_; + int64 num_repacks_ = 0; std::vector> pending_chunks_; std::vector pending_async_copies_; std::vector> pending_required_assignments_; + // The data structure that contains AliasedOffset objects and Allocation to + // AliasedOffset map for efficient lookup. + std::list aliased_offsets_; + absl::flat_hash_map + aliased_offset_map_; // This map contains required memory assignments for HloValues (e.g., input // and outputs). absl::flat_hash_map> required_assignments_; // Number of bytes reserved in alternate memory space. int64 reserved_in_bytes_ = 0; - // Variables to control allocation retries. - bool final_retry_; - bool prefetch_failed_due_to_async_copy_; // Debug strings. std::string buffer_info_str_; std::string allocation_info_str_; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc new file mode 100644 index 00000000000..53b092f1939 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h" + +#include "tensorflow/compiler/xla/service/heap_simulator.h" + +namespace xla { + +namespace { + +using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock; +using Type = GlobalDecreasingSizeBestFitHeap::Type; + +// This class inherits GlobalDecreasingSizeBestFitHeap and converts +// AllocationBlock objects into BufferIntervals that the heap algorithm +// understands. 
+class BestFitRepacker + : public GlobalDecreasingSizeBestFitHeap { + public: + BestFitRepacker(int64 max_size, int64 alignment, Type type) + : GlobalDecreasingSizeBestFitHeap(alignment, type), + max_size_(max_size) {} + + void ImportAllocationBlocks(absl::Span allocations) { + allocation_blocks_ = allocations; + for (AllocationBlock* allocation_block : allocations) { + // Check if any of the colocations are already added to buffer_intervals_. + bool need_allocation = true; + auto aliased_it = absl::c_find_if( + allocation_block->colocations, [&](AllocationBlock* search) { + return buffer_intervals_.contains(search); + }); + if (aliased_it != allocation_block->colocations.end()) { + buffer_intervals_[*aliased_it].colocations.push_back(allocation_block); + need_allocation = false; + } + buffer_intervals_[allocation_block] = {allocation_block, + allocation_block->size, + allocation_block->start_time, + allocation_block->end_time, + {}, + need_allocation}; + } + } + + bool Repack() { + Finish(); + bool success = result_.heap_size <= max_size_; + if (success) { + for (AllocationBlock* block : allocation_blocks_) { + auto chunk_it = result_.chunk_map.find(block); + if (chunk_it != result_.chunk_map.end()) { + block->offset = chunk_it->second.offset; + } + } + } + return success; + } + + private: + int64 max_size_; + absl::Span allocation_blocks_; +}; + +} // namespace + +StatusOr MemorySpaceAssignmentBestFitRepacker::Repack( + absl::Span allocations) { + BestFitRepacker best_fit_repacker = + BestFitRepacker(max_size_, alignment_, type_); + best_fit_repacker.ImportAllocationBlocks(allocations); + return best_fit_repacker.Repack(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h new file mode 100644 index 00000000000..6937b8b0e8c --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" + +namespace xla { + +// This is a repacker algorithm that wraps around best fit heap algorithm in +// heap simulator. 
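A usage sketch for this wrapper, closely mirroring the unit tests added below; the 100-byte budget, the block lifetimes, and the self-colocation convention are taken from those tests:

    MemorySpaceAssignmentBestFitRepacker repacker(/*max_size=*/100,
                                                  /*alignment=*/1);

    // Fields: {start_time, end_time, size, offset, initial_offset, id, colocations}.
    MemorySpaceAssignmentRepacker::AllocationBlock b0{10, 20, 10, -1, -1, 0, {}};
    MemorySpaceAssignmentRepacker::AllocationBlock b1{5, 25, 15, -1, -1, 1, {}};
    b0.colocations.push_back(&b0);  // the tests colocate each block with itself
    b1.colocations.push_back(&b1);

    std::vector<MemorySpaceAssignmentRepacker::AllocationBlock*> blocks = {&b0, &b1};
    StatusOr<bool> modified = repacker.Repack(absl::MakeSpan(blocks));
    // On success, b0.offset and b1.offset hold non-overlapping offsets within
    // the 100-byte budget; on failure the offsets are left untouched.
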
+class MemorySpaceAssignmentBestFitRepacker + : public MemorySpaceAssignmentRepacker { + public: + using Type = GlobalDecreasingSizeBestFitHeap::Type; + + explicit MemorySpaceAssignmentBestFitRepacker( + int64 max_size, int64 alignment, + Type type = GlobalDecreasingSizeBestFitHeap::kTemporal) + : MemorySpaceAssignmentRepacker(max_size, alignment), type_(type) {} + + StatusOr Repack(absl::Span allocations) override; + + private: + Type type_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc new file mode 100644 index 00000000000..44da2828eac --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h" + +#include "tensorflow/core/platform/test.h" + +namespace xla { + +class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test { + protected: + using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock; + + MemorySpaceAssignmentBestFitRepackerTest() : repacker_(100, 1) {} + + AllocationBlock* MakeAllocationBlock(int64 start_time, int64 end_time, + int64 size, int64 initial_offset = -1) { + allocation_blocks_.push_back({start_time, + end_time, + size, + -1, + initial_offset, + static_cast(allocation_blocks_.size()), + {}}); + AllocationBlock* block = &allocation_blocks_.back(); + block->colocations.push_back(block); + return block; + } + + std::list allocation_blocks_; + MemorySpaceAssignmentBestFitRepacker repacker_; +}; + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, Simple) { + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + EXPECT_EQ(allocation_blocks[0]->offset, 15); + EXPECT_EQ(allocation_blocks[1]->offset, 0); +} + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, Colocation) { + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(0, 2, 10)); + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + // Allocation blocks 0 and 1 are colocated. 
+ allocation_blocks[0]->colocations.push_back(allocation_blocks[1]); + allocation_blocks[1]->colocations.push_back(allocation_blocks[0]); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + EXPECT_EQ(allocation_blocks[0]->offset, 15); + EXPECT_EQ(allocation_blocks[1]->offset, 15); + EXPECT_EQ(allocation_blocks[2]->offset, 0); +} + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, TooLarge) { + // Memory size is 100, total size of buffers is 105. + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + allocation_blocks.push_back(MakeAllocationBlock(15, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(12, 22, 50)); + allocation_blocks.push_back(MakeAllocationBlock(10, 18, 20)); + EXPECT_FALSE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + // Make sure the buffers didn't get offset assignments. + EXPECT_EQ(allocation_blocks[0]->offset, -1); + EXPECT_EQ(allocation_blocks[1]->offset, -1); + EXPECT_EQ(allocation_blocks[2]->offset, -1); + EXPECT_EQ(allocation_blocks[3]->offset, -1); + EXPECT_EQ(allocation_blocks[4]->offset, -1); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h new file mode 100644 index 00000000000..eb2f0698a95 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h @@ -0,0 +1,70 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ + +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +// An interface to define allocation repacking algorithms. +class MemorySpaceAssignmentRepacker { + public: + MemorySpaceAssignmentRepacker(int64 max_size, int64 alignment) + : max_size_(max_size), alignment_(alignment) {} + virtual ~MemorySpaceAssignmentRepacker() = default; + + // A contiguous block of allocation consisting of start and end (logical) + // times, size, and the initial offset. After repacking, if the repacking was + // successful and the allocations were modified, the offset field holds the + // new offset. To support aliased allocations, AllocationBlock also includes a + // vector of AllocationBlock pointers, called colocations. All AllocationBlock + // objects within the colocations must get the same offset. The id should be + // unique and is used to ensure determinism for comparison tie-breaker. 
+ struct AllocationBlock { + int64 start_time; + int64 end_time; + int64 size; + int64 offset; + int64 initial_offset; + int64 id; + std::vector colocations; + + std::string ToString() const { + return absl::StrCat("[", start_time, ", ", end_time, "] : size = ", size, + ", offset = ", offset, + " initial offset = ", initial_offset); + } + + // This is required by BufferIntervalCompare as a tie breaker. Use a unique + // and deterministic id. + bool operator<(const AllocationBlock& other) const { return id < other.id; } + }; + + // Repack the AllocationBlocks provided in the parameter. Returns true if + // allocations have been modified and false if not. Returns a non-ok status if + // there was an error. + virtual StatusOr Repack(absl::Span allocations) = 0; + + protected: + int64 max_size_; + int64 alignment_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index a52a4caa12c..5af61eac5d1 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -71,19 +71,22 @@ class MemorySpaceAssignmentTest : public HloTestBase, std::unique_ptr AssignMemorySpace( HloModule* module, int64 max_outstanding_async_copies = -1, - int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2) { + int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2, + absl::optional options = absl::nullopt) { InstructionCountPrefetchIntervalPicker prefetch_interval_picker( min_prefetch_interval, max_prefetch_interval); return AssignMemorySpace(module, max_outstanding_async_copies, /*buffer_interval_compare=*/{}, - &prefetch_interval_picker); + &prefetch_interval_picker, options); } std::unique_ptr AssignMemorySpace( HloModule* module, int64 max_outstanding_async_copies, absl::optional buffer_interval_compare, - PrefetchIntervalPicker* prefetch_interval_picker) { + PrefetchIntervalPicker* prefetch_interval_picker, + absl::optional + memory_space_assignment_options = absl::nullopt) { auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; @@ -117,9 +120,15 @@ class MemorySpaceAssignmentTest : public HloTestBase, } MemorySpaceAssignment::Options options; + if (memory_space_assignment_options) { + options = *memory_space_assignment_options; + } else { + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + } + options.alternate_memory_space = kAlternateMemorySpace; - options.max_size_in_bytes = 128; - options.alignment_in_bytes = 8; options.buffer_interval_compare = buffer_interval_compare; options.prefetch_interval_picker = prefetch_interval_picker; options.size_fn = size_fn; @@ -127,7 +136,6 @@ class MemorySpaceAssignmentTest : public HloTestBase, options.max_outstanding_prefetches = max_outstanding_async_copies; options.max_outstanding_evictions = max_outstanding_async_copies; options.allocate_across_sequential_calls = GetParam(); - options.verify = true; auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie(); std::unique_ptr hlo_live_range = @@ -224,6 +232,24 @@ class MemorySpaceAssignmentTest : public HloTestBase, return copies; } + int64 GetAlternateMemoryOffset(const PresetAssignments& preset_assignments, + const HloInstruction* instruction, + const ShapeIndex& index = {}) const { + // Returns the offset of 
the assignment, -1 if it's not in the alternate + // memory. + const HloModule* module = instruction->parent()->parent(); + auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie(); + HloBuffer& buffer = alias_analysis->GetUniqueBufferAt(instruction, index); + for (auto& pos_and_chunk : preset_assignments.chunks()) { + for (auto& value : buffer.values()) { + if (pos_and_chunk.first == value->defining_position()) { + return pos_and_chunk.second.offset; + } + } + } + return -1; + } + std::unique_ptr CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -4058,6 +4084,340 @@ TEST_P(MemorySpaceAssignmentTest, MoveCopyDoneEarlier) { find_schedule_index(cos->operand(0))); } +TEST_P(MemorySpaceAssignmentTest, BitcastRoot) { + // Tests against a bug where the root of entry computation is a bitcast + // instruction and it ends up getting an allocation in the alternate memory. + absl::string_view hlo_string = R"( +HloModule primitive_computation_gather.4, is_scheduled=true + +%while_body { + %param.1 = (s32[], f32[3,3,3]) parameter(0) + %get-tuple-element.32 = s32[] get-tuple-element(%param.1), index=0 + %copy.6 = s32[] copy(s32[] %get-tuple-element.32) + %constant.8 = s32[] constant(1) + %add = s32[] add(s32[] %copy.6, s32[] %constant.8) + %get-tuple-element.35 = f32[3,3,3] get-tuple-element(%param.1), index=1 + negate = f32[3,3,3] negate(get-tuple-element.35) + ROOT %tuple.10 = (s32[], f32[3,3,3]) tuple(s32[] %add, f32[3,3,3] negate) +} + +%while_cond { + %param.0 = (s32[], f32[3,3,3]) parameter(0) + %get-tuple-element = s32[] get-tuple-element(%param.0), index=0 + %constant.3 = s32[] constant(3) + ROOT %compare = pred[] compare(s32[] %get-tuple-element, s32[] %constant.3), direction=LT +} + +ENTRY %primitive_computation_gather.4 (parameter.1: f32[3,10,5], parameter.2: s32[3,1]) -> f32[3,3,3] { + %constant.1 = s32[] constant(0) + %copy.11 = s32[] copy(s32[] %constant.1) + %constant = f32[] constant(0) + %broadcast = f32[3,3,3] broadcast(f32[] %constant), dimensions={} + %tuple.8 = (s32[], f32[3,10,5], s32[3,1], f32[3,3,3]) tuple(s32[] %copy.11, f32[3,3,3] %broadcast) + %while = (s32[], f32[3,3,3]) while(%tuple.8), condition=%while_cond, body=%while_body + %get-tuple-element.7 = f32[3,3,3] get-tuple-element(%while), index=1 + ROOT %bitcast.1 = f32[3,3,3] bitcast(f32[3,3,3] %get-tuple-element.7) +} + )"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_TRUE(!root->shape().has_layout() || + root->shape().layout().memory_space() == kDefaultMemorySpace); +} + +// A mock MemorySpaceAssignmentRepacker class that accepst a map of +// (start_time,offset) -> new_offset values. Using this map, the repacker +// repacks the allocations to the new_offset. 
+class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker { + public: + explicit FakeMemorySpaceAssignmentRepacker( + absl::flat_hash_map, int64>& repack_map, + std::function)> check_fun = nullptr) + : MemorySpaceAssignmentRepacker(/*max_size=*/128, /*alignment=*/8), + repack_map_(repack_map), + check_fun_(check_fun) {} + + StatusOr Repack(absl::Span allocations) override { + bool modified = false; + for (AllocationBlock* block : allocations) { + absl::flat_hash_set colocations; + std::string colocations_str; + for (const AllocationBlock* colocation : block->colocations) { + absl::StrAppend(&colocations_str, colocation->id, ", "); + colocations.insert(colocation->id); + } + VLOG(1) << "Alloc id: " << block->id << " time: [" << block->start_time + << ", " << block->end_time << "] size: " << block->size + << " init offset: " << block->initial_offset << " colocations: {" + << colocations_str << "}"; + auto it = repack_map_.find({block->start_time, block->initial_offset}); + if (it != repack_map_.end()) { + modified = true; + block->offset = it->second; + } else { + block->offset = block->initial_offset; + } + for (AllocationBlock* colocation : block->colocations) { + if (it != repack_map_.end()) { + colocation->offset = it->second; + } else { + colocation->offset = colocation->initial_offset; + } + } + } + if (check_fun_) { + check_fun_(allocations); + } + + return modified; + } + + private: + // A map from (start_time, offset) to new_offset. + absl::flat_hash_map, int64> repack_map_; + std::function)> check_fun_; +}; + +TEST_P(MemorySpaceAssignmentTest, Repack) { + // We initially perform the following allocations at these offsets. + // + // Max memory + // ------------------------------------------- + // + // + // + // + // +------------+ + // | b | + // +------------+ + // +-------+ +------------+ + // | a | | n | + // +-------+ +------------+ + // ------------------------------------------- + // Min memory time -> + // + // Next up, we try to allocate the prefetch for m. 
However due to + // fragmentation, this won't be possible: + // + // Max memory + // ------------------------------------------- + // + // + // + // +---------+ + // +------------+ | + // | b | | | + // +------------+ | + // +-------+ | | +------------+ + // | a | | d | | n | + // +-------+ +---------+ +------------+ + // ------------------------------------------- + // Min memory time -> + // + // We then call repack to repack the existing allocations which allows us to + // allocate the prefetch for m: + // + // Max memory + // ------------------------------------------- + // +---------+ + // | | + // | | + // | | + // +-------+ | | + // | a | | d | + // +-------+ +---------+ + // +------------+ +------------+ + // | b | | n | + // +------------+ +------------+ + // ------------------------------------------- + // Min memory time -> + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + ENTRY Entry { + param0 = f32[8,3] parameter(0) + param1 = f32[2,4] parameter(1) + a = f32[2,4] sine(param1) + b = f32[2,4] cosine(param1) + c = f32[8,3] negate(param0) + j = f32[2,4] negate(a) + d = f32[8,3] tanh(param0) + k = f32[2,4] negate(j) + l = f32[2,4] add(b, k) + m = f32[8,3] negate(d) + n = f32[2,4] sine(l) + o = f32[8,3] negate(m) + p = f32[2,4] negate(n) + q = f32[8,3] negate(m) + ROOT tuple = (f32[2,4], f32[8,3], f32[8,3]) tuple(p, q, o) + } + )"; + + MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare = + [](const MemorySpaceAssignment::BufferInterval& a, + const MemorySpaceAssignment::BufferInterval& b) { + auto get_opcode_priority = [](const HloOpcode& opcode) { + switch (opcode) { + case HloOpcode::kSin: + return 0; + case HloOpcode::kCos: + return 1; + case HloOpcode::kTanh: + return 2; + default: + return 3; + } + }; + + return get_opcode_priority(a.buffer->defining_instruction()->opcode()) < + get_opcode_priority(b.buffer->defining_instruction()->opcode()); + }; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10); + absl::flat_hash_map, int64> repack_map; + // Move "a" from offset 0 to 32. + repack_map[{2, 0}] = 32; + // Move "b" from offset 32 to 0. + repack_map[{3, 32}] = 0; + FakeMemorySpaceAssignmentRepacker repacker = + FakeMemorySpaceAssignmentRepacker(repack_map); + MemorySpaceAssignment::Options options; + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + options.max_repacks = 1; + options.repacker = &repacker; + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + buffer_interval_compare, &prefetch_interval_picker, + options); + + // If repacking succeeds, we should find the buffer for d in alternate memory. + const HloInstruction* d = + module->entry_computation()->GetInstructionWithName("d"); + EXPECT_EQ(d->shape().layout().memory_space(), kAlternateMemorySpace); +} + +TEST_P(MemorySpaceAssignmentTest, RepackExportsAliasedOffsets) { + // This test is that we are correctly exporting aliased offsets for repacking. + // In this example, the buffer produced at HLO "a" will be allocated first, + // and will consist of four allocations: + // 1) a produced in the alternate memory (and then evicted to the default + // memory). 2) a prefetched to the alternate memory to be used by q and + // while HLOs. 3) a used within the while loop body. 4) the output of while + // HLO, used by u. 
+ // + // Since a will be allocated first (the test is crafted to prioritize sine + // HLO), all four allocations should get the same (zero) offsets. However, + // while allocations 2, 3, and 4 need to be colocated with each other, + // allocation 1 doesn't need to be colocated with the other three. + absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + while_condition { + param1 = (f32[2,4], f32[2,4]) parameter(0) + ROOT cond = pred[] constant(true) + } + + while_body { + param2 = (f32[2,4], f32[2,4]) parameter(0) + gte2 = f32[2,4] get-tuple-element(param2), index=0 + gte3 = f32[2,4] get-tuple-element(param2), index=1 + add = f32[2,4] add(gte2, gte3) + ROOT tuple2 = (f32[2,4], f32[2,4]) tuple(add, gte3) + } + + ENTRY Entry { + param0 = f32[2,4] parameter(0) + a = f32[2,4] sine(param0) + b = f32[2,4] negate(a) + c = f32[2,4] negate(b) + d = f32[2,4] negate(c) + e = f32[2,4] negate(d) + f = f32[2,4] negate(e) + g = f32[2,4] negate(f) + h = f32[2,4] negate(g) + i = f32[2,4] negate(h) + j = f32[2,4] negate(i) + k = f32[2,4] negate(j) + l = f32[2,4] negate(k) + m = f32[2,4] negate(l) + n = f32[2,4] negate(m) + o = f32[2,4] negate(n) + p = f32[2,4] negate(o) + q = f32[2,4] add(p, a) + tuple = (f32[2,4], f32[2,4]) tuple(q, a) + while = (f32[2,4], f32[2,4]) while(tuple), condition=while_condition, body=while_body + gte0 = f32[2,4] get-tuple-element(while), index=0 + gte1 = f32[2,4] get-tuple-element(while), index=1 + r = f32[2,4] negate(gte0) + s = f32[2,4] negate(r) + t = f32[2,4] negate(s) + constant = f32[] constant(0) + broadcast = f32[8,4] broadcast(constant), dimensions={} + cos = f32[8,4] cosine(broadcast) + u = f32[2,4] add(t, gte1) + v = f32[2,4] add(u, param0) + w = f32[8,4] negate(cos) + ROOT tuple3 = (f32[2,4], f32[8,4]) tuple(v, w) + } + )"; + + MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare = + [](const MemorySpaceAssignment::BufferInterval& a, + const MemorySpaceAssignment::BufferInterval& b) { + auto get_opcode_priority = [](const HloOpcode& opcode) { + switch (opcode) { + case HloOpcode::kSin: + return 0; + case HloOpcode::kCos: + return 1; + case HloOpcode::kTanh: + return 2; + default: + return 3; + } + }; + + return get_opcode_priority(a.buffer->defining_instruction()->opcode()) < + get_opcode_priority(b.buffer->defining_instruction()->opcode()); + }; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10); + absl::flat_hash_map, int64> repack_map; + + // Expect that of the four separate allocations for the "a" buffer, the first + // and the next three are in separate colocations. 
+ auto check_fun = + [](absl::Span + allocations) { + EXPECT_TRUE(allocations.at(0)->colocations.size() == 1 || + allocations.at(0)->colocations.size() == 3); + EXPECT_EQ(allocations.at(1)->colocations.size(), 3); + EXPECT_EQ(allocations.at(2)->colocations.size(), 3); + EXPECT_TRUE(allocations.at(3)->colocations.size() == 1 || + allocations.at(3)->colocations.size() == 3); + }; + FakeMemorySpaceAssignmentRepacker repacker = + FakeMemorySpaceAssignmentRepacker(repack_map, check_fun); + MemorySpaceAssignment::Options options; + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + options.max_repacks = 1; + options.repacker = &repacker; + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + buffer_interval_compare, &prefetch_interval_picker, + options); +} + TEST_P(MemorySpaceAssignmentTest, Determinism) { // Run memory space assignment a few times to make sure every time it compiles // to the same thing. @@ -4073,6 +4433,47 @@ TEST_P(MemorySpaceAssignmentTest, Determinism) { } } +TEST_P(MemorySpaceAssignmentTest, InPlaceOp) { + // Tests that in-place ops like DynamicUpdateSlice get the same allocation as + // its input. + absl::string_view hlo_string = R"( +HloModule Module, is_scheduled=true + +fused_computation { + param0 = f32[2,3] parameter(0) + constant.1 = f32[] constant(0) + broadcast = f32[2,1] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[2,3] dynamic-update-slice(param0, broadcast, constant.3, constant.3) +} + +ENTRY main { + param = f32[2,3] parameter(0) + negate = f32[2,3] negate(param) + fusion = f32[2,3] fusion(negate), kind=kLoop, calls=fused_computation + ROOT add = f32[2,3] add(fusion, fusion) +} + )"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto preset_assignments = AssignMemorySpace(module.get()); + HloInstruction* negate_instruction = + module->entry_computation()->GetInstructionWithName("negate"); + int64 negate_offset = + GetAlternateMemoryOffset(*preset_assignments, negate_instruction); + HloInstruction* fusion_instruction = + module->entry_computation()->GetInstructionWithName("fusion"); + int64 fusion_offset = + GetAlternateMemoryOffset(*preset_assignments, fusion_instruction); + // We expect negate and fusion to get the same offsets. 
+ EXPECT_EQ(negate_offset, fusion_offset); + const bool allocate_across_sequential_calls = GetParam(); + if (allocate_across_sequential_calls) { + EXPECT_NE(negate_offset, -1); + } +} + INSTANTIATE_TEST_SUITE_P(MemorySpaceAssignmentInstantiation, MemorySpaceAssignmentTest, ::testing::Values(false, true)); @@ -4354,6 +4755,166 @@ TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchFusionTest) { EXPECT_EQ(cross_program_prefetches.size(), 0); } +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchPinnedTest) { + HloComputation::Builder builder(TestName()); + + constexpr int kBatch = 8; + constexpr int kFeature = 8; + constexpr int kOutput = 2; + + auto lhs_shape = ShapeUtil::MakeShape(F32, {kBatch, kFeature}); + auto rhs_shape = ShapeUtil::MakeShapeWithLayout( + F32, {kFeature, kOutput}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + auto result_shape = ShapeUtil::MakeShape(F32, {kBatch, kOutput}); + auto tuple_shape = ShapeUtil::MakeTupleShape({lhs_shape, rhs_shape}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "p0")); + + auto lhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs_shape, param, 0)); + auto rhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs_shape, param, 1)); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction(HloInstruction::CreateDot( + result_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {param, lhs, rhs, dot}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 0); +} + +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchReuse) { + // This test is for checking if the cross-program-prefetched buffer is freed + // after its last use and there is an end-of-program prefetch. 
+ absl::string_view hlo_string = R"( + HloModule cross_program_prefetch, is_scheduled=true + + ENTRY CrossProgramPrefetch { + p0 = (f32[8,8]{1,0}, f32[8,2]{1,0}) parameter(0) + get-tuple-element = f32[8,8]{1,0} get-tuple-element(p0), index=0 + get-tuple-element.1 = f32[8,2]{1,0} get-tuple-element(p0), index=1 + dot = f32[8,2]{1,0} dot(get-tuple-element, get-tuple-element.1), lhs_contracting_dims={1}, rhs_contracting_dims={0} + negate.1 = f32[8,2]{1,0} negate(dot) + negate.2 = f32[8,2]{1,0} negate(negate.1) + negate.3 = f32[8,2]{1,0} negate(negate.2) + negate.4 = f32[8,2]{1,0} negate(negate.3) + negate.5 = f32[8,2]{1,0} negate(negate.4) + negate.6 = f32[8,2]{1,0} negate(negate.5) + negate.7 = f32[8,2]{1,0} negate(negate.6) + negate.8 = f32[8,2]{1,0} negate(negate.7) + ROOT negate.9 = f32[8,2]{1,0} negate(negate.8) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + /*max_prefetch_interval=*/5, /*min_prefetch_interval=*/2); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + const HloValue& cross_program_prefetched_value = + dataflow_analysis->GetValueDefinedAt( + module->entry_computation()->parameter_instruction(0), {1}); + // Expect that there are two prefetches that use this value, one is the + // cross-program prefetch, the other is the end-of-program prefetch. + auto is_cross_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_cross_program_prefetch), + 1); + auto is_end_of_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + !use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_end_of_program_prefetch), + 1); +} + +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchNoReuse) { + // This tests the scenario that the cross-program-prefetched buffer is used + // again close to the end of the computation. In this case, it is better not + // to free the buffer. 
+ absl::string_view hlo_string = R"( + HloModule cross_program_prefetch, is_scheduled=true + + ENTRY CrossProgramPrefetch { + p0 = (f32[8,8]{1,0}, f32[8,2]{1,0}) parameter(0) + get-tuple-element = f32[8,8]{1,0} get-tuple-element(p0), index=0 + get-tuple-element.1 = f32[8,2]{1,0} get-tuple-element(p0), index=1 + dot = f32[8,2]{1,0} dot(get-tuple-element, get-tuple-element.1), lhs_contracting_dims={1}, rhs_contracting_dims={0} + negate.1 = f32[8,2]{1,0} negate(dot) + negate.2 = f32[8,2]{1,0} negate(negate.1) + negate.3 = f32[8,2]{1,0} negate(negate.2) + negate.4 = f32[8,2]{1,0} negate(negate.3) + negate.5 = f32[8,2]{1,0} negate(negate.4) + negate.6 = f32[8,2]{1,0} negate(negate.5) + negate.7 = f32[8,2]{1,0} negate(negate.6) + negate.8 = f32[8,2]{1,0} negate(negate.7) + ROOT dot.2 = f32[2,2]{1,0} dot(negate.8, get-tuple-element.1), lhs_contracting_dims={0}, rhs_contracting_dims={0} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + /*max_prefetch_interval=*/5, /*min_prefetch_interval=*/2); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + const HloValue& cross_program_prefetched_value = + dataflow_analysis->GetValueDefinedAt( + module->entry_computation()->parameter_instruction(0), {1}); + // Expect that there is one prefetch that use this value, the cross-program + // prefetch. There shouldn't be an end-of-program prefetch. + auto is_cross_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_cross_program_prefetch), + 1); + auto is_end_of_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + !use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_end_of_program_prefetch), + 0); +} + using CostAnalysisPrefetchIntervalPickerTest = HloTestBase; TEST_F(CostAnalysisPrefetchIntervalPickerTest, PrefetchIntervalOrder) { @@ -4578,11 +5139,12 @@ TEST_F(CostAnalysisPrefetchIntervalPickerTest, NestedWhile) { HloInstruction* root = module->entry_computation()->root_instruction(); const HloUse use{root, /*operand_number=*/1, /*operand_index=*/{}}; + const Shape& shape = root->operand(1)->shape(); // We expect the root's latest prefetch start time to be before the while loop // (logical time 4). - EXPECT_EQ(interval_picker.LatestPrefetchStartTime(use, /*start_time=*/0, - /*end_time=*/23), + EXPECT_EQ(interval_picker.LatestPrefetchStartTime(shape, /*start_time=*/0, + /*end_time=*/23, &use), 4); } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc index 0215f007c9c..0c44ae0d766 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc @@ -17,21 +17,21 @@ limitations under the License. 
namespace xla { -bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { +bool MemorySpaceAssignmentUtils::IsValueAllowedInAlternateMemory( + const HloValue* value) { // If the buffer is a tuple, don't use this algorithm for now. The buffers // that are pointed to by the tuple will still use this algorithm. Because // tuples are cheap to place in the alternate memory (they are just pointers) // we don't need to use prefetch/evict logic. - if (interval.buffer->shape().IsTuple()) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + if (value->shape().IsTuple()) { + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a tuple."; return false; } // Don't place scalars in the alternate memory. - if (ShapeUtil::IsEffectiveScalar(interval.buffer->shape())) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + if (ShapeUtil::IsEffectiveScalar(value->shape())) { + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a scalar."; return false; } @@ -44,10 +44,10 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( // allocate TupleSelect in the alternate memory space. // TODO(berkin): Not allocating add-dependencies either since they need to be // treated specially. We should revisit this later. - for (const HloPosition& position : interval.buffer->positions()) { + for (const HloPosition& position : value->positions()) { if (position.instruction->opcode() == HloOpcode::kTupleSelect || position.instruction->opcode() == HloOpcode::kAddDependency) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it has a tuple-select or " << "add-dependency position."; return false; @@ -56,18 +56,18 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( // Send and Recv HLOs return a request identifier. These should not be // allocated in the alternate memory. - for (const HloPosition& position : interval.buffer->positions()) { + for (const HloPosition& position : value->positions()) { if ((position.instruction->opcode() == HloOpcode::kSend || position.instruction->opcode() == HloOpcode::kRecv)) { // TODO(berkin): Send/recv buffers need a stable buffer allocation // throughout sending/receiving. Disable memory space allocation for these // for now. if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a send/recv buffer."; return false; } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a request identifier for " "send/recv."; return false; @@ -78,11 +78,11 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { // Disable memory space allocation for these for now. 
if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a collective-permute buffer."; return false; } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a collective-permute buffer."; return false; } @@ -92,4 +92,10 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( return true; } +bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { + return IsValueAllowedInAlternateMemory(interval.buffer) && + absl::c_all_of(interval.colocations, IsValueAllowedInAlternateMemory); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.h b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h index 651ac107c25..082efa5eb64 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_utils.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h @@ -26,7 +26,11 @@ class MemorySpaceAssignmentUtils { // Returns true if this buffer is allowed to be placed in the alternate // memory. static bool IsIntervalAllowedInAlternateMemory( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval); + const GlobalDecreasingSizeBestFitHeap::BufferInterval& + interval); + + // Returns true if the HloValue is allowed to be placed in alternate memory. + static bool IsValueAllowedInAlternateMemory(const HloValue* value); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 2bcf5fa7dae..1990e962802 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -41,9 +41,12 @@ cc_library( srcs = ["emission_context.cc"], hdrs = ["emission_context.h"], deps = [ + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:lhlo", "//tensorflow/compiler/xla/service:hlo", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", ], ) @@ -82,8 +85,9 @@ cc_library( ":kernel_lowering", ":lhlo_dialect_emitter", "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:Core", "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", @@ -148,51 +152,68 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/stream_executor:stream_executor_headers", "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:Core", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:StandardOps", ], ) +cc_library( + name = "passes", + srcs = ["passes.cc"], + hdrs = ["passes.h"], + deps = [ + "//tensorflow/compiler/mlir/hlo:lhlo", + "@com_google_absl//absl/memory", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], +) + cc_library( name = "kernel_lowering", srcs = ["kernel_lowering.cc"], hdrs = 
["kernel_lowering.h"], deps = [ + ":passes", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo_copy_removal", "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", - "@llvm-project//mlir:Affine", "@llvm-project//mlir:AffineToStandardTransforms", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUToROCDLTransforms", "@llvm-project//mlir:GPUTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", - "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLDialect", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:SCFToGPUPass", "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc index ca979262df0..06c7ebd1099 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc @@ -16,8 +16,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h" #include "absl/strings/substitute.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { @@ -25,6 +28,8 @@ namespace mlir_gpu { EmissionContext::EmissionContext(std::unique_ptr module) : module_(std::move(module)), context_() { + context_.loadDialect(); error_handler_ = [](const ErrorMap& instructions_with_error, HloModule* module) { std::set computations_with_error; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD index 8f56548ce77..eb7cd2115f3 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD @@ -72,12 +72,14 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/platform:test", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:AffineToStandardTransforms", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index d5cad385324..c868d205310 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -20,6 +20,8 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -46,6 +48,7 @@ std::string CompileHloConvAndGetMlir(absl::string_view hlo_text) { hlo_module.entry_computation()->root_instruction(); mlir::MLIRContext context; + context.loadDialect(); mlir::OwningModuleRef mlir_module( mlir::ModuleOp::create(mlir::UnknownLoc::get(&context))); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 2e3fa00ca86..a9e4a2390fd 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -18,423 +18,33 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project +#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project -#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project #include "mlir/Dialect/GPU/Passes.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project #include "mlir/Dialect/SCF/Passes.h" // from @llvm-project -#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/Region.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/BufferPlacement.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace mlir_gpu { -namespace { - -using ::mlir::lmhlo::FusionOp; - -// Replaces a FusionOp by the operations contained in its region. -struct FusionOpRemover - : public mlir::PassWrapper { - void runOnFunction() override { - getFunction().walk([&](FusionOp op) { - mlir::OpBuilder builder(op); - // FusionOp has a single region with a single block, so we can just walk - // over it and clone operations to the outside. 
- mlir::BlockAndValueMapping mapping; - for (auto& nested_op : op.region().front().without_terminator()) { - auto clone = builder.clone(nested_op, mapping); - for (auto pair : - llvm::zip(nested_op.getResults(), clone->getResults())) { - mapping.map(std::get<0>(pair), std::get<1>(pair)); - } - } - op.erase(); - }); - } -}; - -// Simple pass that replaces a load that immediately follows a store to the -// same address with the stored value. This needs generalization. -struct StoreForwardingPass - : mlir::PassWrapper { - mlir::StoreOp findStore(mlir::Operation* op, - std::function matches) { - // Search from op upwards in the current block. - mlir::Block* block = op->getBlock(); - auto startFromIt = - std::find_if(block->rbegin(), block->rend(), - [op](mlir::Operation& other) { return &other == op; }); - for (auto storeOpIt = startFromIt; storeOpIt != block->rend(); - ++storeOpIt) { - auto storeOp = llvm::dyn_cast(&*(storeOpIt)); - if (!storeOp || !matches(storeOp)) { - continue; - } - - return storeOp; - } - // No store operation found. Continue search outside of the parallel - // loop if block is in a parallel loop. - if (auto parallelOp = - llvm::dyn_cast(block->getParentOp())) { - return findStore(parallelOp.getOperation(), matches); - } - return {}; - } - - // Recursively search defining ops for AllocOp. Return either AllocOp if it is - // found or nullptr. - mlir::Operation* SearchAllocOp(mlir::Value memref) { - mlir::Operation* defOp = memref.getDefiningOp(); - while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { - defOp = subviewOp.source().getDefiningOp(); - } - if (auto allocOp = mlir::dyn_cast_or_null(defOp)) { - return allocOp.getOperation(); - } - return nullptr; - } - - // Retrieves AllocOp from the cache or actually looks for it. - mlir::Operation* GetAllocOp( - mlir::Value memref, - llvm::DenseMap* memrefToAllocOp) { - auto allocOpIt = memrefToAllocOp->find(memref); - if (allocOpIt != memrefToAllocOp->end()) { - return allocOpIt->second; - } - auto allocOp = SearchAllocOp(memref); - memrefToAllocOp->insert({memref, allocOp}); - return allocOp; - } - - void runOnFunction() override { - llvm::DenseMap memrefToAllocOp; - - getFunction().walk([&](mlir::LoadOp loadOp) { - auto storeOp = findStore(loadOp, [&](mlir::StoreOp storeOp) { - mlir::Operation* storeOpAlloc = - GetAllocOp(storeOp.memref(), &memrefToAllocOp); - mlir::Operation* loadOpAlloc = - GetAllocOp(loadOp.memref(), &memrefToAllocOp); - return storeOpAlloc && loadOpAlloc && (storeOpAlloc == loadOpAlloc); - }); - if (!storeOp) { - return; - } - auto storeIndices = storeOp.getIndices(); - auto loadIndices = loadOp.getIndices(); - if (!std::equal(storeIndices.begin(), storeIndices.end(), - loadIndices.begin(), loadIndices.end())) { - return; - } - loadOp.replaceAllUsesWith(storeOp.getValueToStore()); - loadOp.erase(); - }); - } -}; - -// Simple pass that removes temporary buffers that are only written to but -// never read from or that are read but the read value is not used. -// Needs an analysis that proves that loads and stores are side-effect free -// (in bounds, no aliasing, etc.). -struct DeadTempBufferRemoval - : mlir::PassWrapper { - bool operationConsideredDead(mlir::Operation* op) { - for (auto result : op->getResults()) { - if (!llvm::all_of(result.getUsers(), [&](mlir::Operation* op) { - // Store and Dealloc is OK. - if (llvm::isa(op)) { - return true; - } - // Load without uses is also ok. - if (auto loadOp = llvm::dyn_cast(op)) { - return loadOp.use_empty(); - } - // Subview is ok if it is dead itself. 
- if (llvm::isa(op)) { - return operationConsideredDead(op); - } - return false; - })) { - return false; - } - } - return true; - } - - void recursiveErase(mlir::Operation* op, - llvm::SmallVectorImpl* erase_list) { - for (auto result : op->getResults()) { - for (auto user : llvm::make_early_inc_range(result.getUsers())) { - recursiveErase(user, erase_list); - } - } - erase_list->push_back(op); - } - - void runOnFunction() override { - llvm::SmallVector dead_ops; - getFunction().walk([&](mlir::AllocOp allocOp) { - if (!operationConsideredDead(allocOp)) { - return; - } - - // TODO(herhut): There should be a generic helper for this. - recursiveErase(allocOp, &dead_ops); - }); - for (auto op : dead_ops) { - op->erase(); - } - } -}; - -// TODO(herhut): Move this to MLIR core. -struct MoveScalarComputationsIntoGpuLaunch - : mlir::PassWrapper { - static bool isInliningBeneficiary(mlir::Operation* op) { - return llvm::isa(op); - } - - static bool extractBeneficiaryOps( - mlir::Operation* op, llvm::SmallVectorImpl* ops, - llvm::SetVector args) { - if (!isInliningBeneficiary(op)) { - return false; - } - - ops->push_back(op); - for (auto operand : op->getOperands()) { - // It is an existing arg, keep going. - if (args.count(operand)) { - continue; - } - mlir::Operation* definingOp = operand.getDefiningOp(); - if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { - return false; - } - } - return true; - } - - static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { - llvm::SetVector used_above; - mlir::getUsedValuesDefinedAbove(launch.body(), used_above); - mlir::BlockAndValueMapping inlined_map; - for (mlir::Value v : used_above) { - llvm::SmallVector ops_to_move; - mlir::Operation* definingOp = v.getDefiningOp(); - if (definingOp && - extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { - mlir::OpBuilder b(launch.body()); - for (mlir::Operation* op : llvm::reverse(ops_to_move)) { - auto result = b.clone(*op, inlined_map); - for (auto pair : llvm::zip(op->getResults(), result->getResults())) { - mlir::replaceAllUsesInRegionWith(std::get<0>(pair), - std::get<1>(pair), launch.body()); - } - inlined_map.map(op->getResults(), result->getResults()); - } - } - } - } - - void runOnFunction() override { - mlir::FuncOp fun = getFunction(); - fun.walk( - [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); - } -}; - -// Sort the operands to the kernel for a deterministic order. First operands -// that are defined by function arguments, followed by operands that are -// returned from the function. This only works for simple functions without -// control flow and can be used in cases where the kernel is extracted and used -// independently of the host-side code. -struct RewriteKernelSignature - : mlir::PassWrapper { - void runOnFunction() override { - mlir::FuncOp func = getFunction(); - mlir::ModuleOp module = func.getParentOfType(); - getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) { - mlir::gpu::GPUFuncOp kernel = - module.lookupSymbol(launchOp.kernel()); - - if (kernel.getNumFuncArguments() != - func.getNumArguments() + func.getNumResults()) { - kernel.emitError() - << "number of kernel arguments does not match number" - << "of arguments and results of surrounding function"; - signalPassFailure(); - return; - } - if (!llvm::hasSingleElement(func)) { - func.emitError() << "surrounding function has more than one block"; - signalPassFailure(); - return; - } - - // Compute a map from function arguments to kernel function operands. 
- mlir::BlockAndValueMapping func_to_kernel; - for (mlir::BlockArgument arg : func.getArguments()) { - for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { - if (launchOp.getKernelOperand(i) == arg) { - func_to_kernel.map(arg, kernel.getArgument(i)); - break; - } - } - } - // Also add function results that are computed by the launch. - mlir::Operation* returnOp = func.getBody().back().getTerminator(); - for (mlir::Value result : returnOp->getOperands()) { - for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { - if (launchOp.getKernelOperand(i) == result) { - func_to_kernel.map(result, kernel.getArgument(i)); - break; - } - } - } - - // Create a new kernel function with modified signature. It will have the - // parameters and result types of the original funcion as its parameter - // type and otherwise will be void. - auto gpu_module = kernel.getParentOfType(); - mlir::OpBuilder kernel_builder(gpu_module.body()); - auto operand_types = llvm::to_vector<4>(llvm::concat( - func.getType().getInputs(), func.getType().getResults())); - auto new_kernel = kernel_builder.create( - kernel.getLoc(), kernel.getName(), - kernel_builder.getFunctionType(operand_types, {})); - new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(), - kernel_builder.getUnitAttr()); - - // Create a map from old kernel argument to new one. - mlir::BlockAndValueMapping old_kernel_to_new; - for (int i = 0, e = func.getNumArguments(); i < e; ++i) { - mlir::Value func_arg = func.getArgument(i); - mlir::Value new_kernel_arg = new_kernel.getArgument(i); - mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); - if (!old_kernel_arg) { - kernel.emitOpError() - << "argument " << i - << " to containing function is not an argument to the kernel"; - signalPassFailure(); - return; - } - old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); - } - for (int i = 0, e = returnOp->getNumOperands(); i < e; ++i) { - mlir::Value ret_op = returnOp->getOperand(i); - mlir::Value new_kernel_arg = - new_kernel.getArgument(func.getNumArguments() + i); - mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(ret_op); - if (!old_kernel_arg) { - kernel.emitOpError() - << "result " << i - << " of containing function is not an argument to the kernel"; - signalPassFailure(); - return; - } - old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); - } - // Steal the body by appending the blocks and inserting a branch. - kernel.body().cloneInto(&new_kernel.getBody(), old_kernel_to_new); - kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); - kernel_builder.create( - new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); - // Now create a new launchOp calling the new kernel. We need to forward - // the arguments of the surrounding function and operands to the return. - mlir::SmallVector new_operands; - new_operands.reserve(new_kernel.getNumFuncArguments()); - new_operands.append(func.args_begin(), func.args_end()); - new_operands.append(returnOp->operand_begin(), returnOp->operand_end()); - mlir::OpBuilder launch_builder(launchOp); - launch_builder.create( - launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), - launchOp.getBlockSizeOperandValues(), new_operands); - // Launch does not have results, so we can just erase it. And the kernel - // also needs to go. - launchOp.erase(); - kernel.erase(); - }); - } -}; - -// Extract_element(mhlo_scalars_to_dimension_tensor(v_i), i) -> v_i -// -// We need to direct fusion to the inner loops. 
This cannot be done with -// a passmanager alone ATM, as nested pass managers require operations to -// be closed from above. -struct MapParallelLoops - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); - } -}; - -// We need to direct fusion to the inner loops. This cannot be done with -// a passmanager alone ATM, as nested pass managers require operations to -// be closed from above. -struct FuseInnerParallelLoops - : public mlir::PassWrapper { - void runOnFunction() override { - getFunction().walk([](mlir::scf::ParallelOp op) { - mlir::scf::naivelyFuseParallelOps(op.region()); - }); - } -}; - -// Collapse all loop dimension into the first one. -struct ParallelLoopCollapsingToFirstDim - : public mlir::PassWrapper> { - void runOnOperation() override { - mlir::Operation* module = getOperation(); - - module->walk([&](mlir::scf::ParallelOp op) { - unsigned num_loops = op.getNumLoops(); - std::vector combinedLoops; - combinedLoops.reserve(num_loops); - for (unsigned i = 0; i < num_loops; ++i) { - combinedLoops.push_back(i); - } - mlir::collapseParallelLoops(op, {combinedLoops}); - }); - } -}; -} // namespace Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { mlir::PassManager pm(module.getContext()); @@ -461,9 +71,9 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Moving `AllocOp`s and inserting missing `DeallocOp`s pm.addPass(::mlir::createBufferPlacementPass()); // Next, we can strip the outer fusion operation. - pm.addPass(absl::make_unique()); + pm.addPass(createFusionOpRemoverPass()); // Remove unnecessary LHLO copies. - pm.addPass(::mlir::lmhlo::createLhloCopyRemovalPass()); + pm.addPass(::mlir::createCopyRemovalPass()); // Transform LHLO operations to LinAlg. pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass()); // Fuse linalg operations. @@ -479,26 +89,26 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Fuse the inner-most loops. - pm.addPass(absl::make_unique()); + pm.addPass(createFuseInnerParallelLoopsPass()); // Run CSE to ensure that loads and stores to the same subview get // recognized as such. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Forward stores to buffers to loads. - pm.addPass(absl::make_unique()); + pm.addPass(createStoreForwardingPass()); // Remove now unused temporary buffers. - pm.addPass(absl::make_unique()); + pm.addPass(createDeadTempBufferRemovalPass()); if (!options.unroll_factors.empty()) { pm.addPass(::mlir::createParallelLoopTilingPass(as_int64)); } // Project all loop dimensions to X if necessary. if (options.collapse_parallel_loops) { - pm.addPass(absl::make_unique()); + pm.addPass(createParallelLoopCollapsingToFirstDimPass()); } // Some basic cleanup. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); // Greedily map the remaining loop to GPU hardware dimensions. - pm.addPass(absl::make_unique()); + pm.addPass(createMapParallelLoopsPass()); // Apply the mapping. pm.addPass(mlir::createParallelLoopToGpuPass()); // Some basic cleanup. @@ -515,13 +125,13 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { ::mlir::mhlo::createLegalizeTanhToApproximationPass()); } // Move scalar operations into the launch to ensure smaller signatures. 
- pm.addPass(absl::make_unique()); + pm.addPass(createMoveScalarComputationsIntoGpuLaunchPass()); // Take launches to launches with kernels. pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's // signature if (options.rewrite_signature) { - pm.addPass(absl::make_unique()); + pm.addPass(createRewriteKernelSignaturePass()); } if (failed(pm.run(module))) { return InternalError("Lowering to GPU kernels failed."); @@ -536,6 +146,10 @@ namespace { class LowerToNVVMPass : public ::mlir::PassWrapper< LowerToNVVMPass, ::mlir::OperationPass<::mlir::gpu::GPUModuleOp>> { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override { ::mlir::gpu::GPUModuleOp m = getOperation(); @@ -585,6 +199,85 @@ Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) { return Status::OK(); } +namespace { + +/// A pass that does the final lowering to ROCDL. It collects all the patterns +/// that are currently required, currently mixing std, linalg and gpu. +class LowerToROCDLPass + : public ::mlir::PassWrapper< + LowerToROCDLPass, ::mlir::OperationPass<::mlir::gpu::GPUModuleOp>> { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + registry.insert(); + } + + public: + void runOnOperation() override { + ::mlir::gpu::GPUModuleOp m = getOperation(); + + ::mlir::OwningRewritePatternList patterns; + ::mlir::populateGpuRewritePatterns(m.getContext(), patterns); + ::mlir::applyPatternsAndFoldGreedily(m, patterns); + patterns.clear(); + + ::mlir::LLVMTypeConverter converter(m.getContext()); + ::mlir::populateStdToLLVMConversionPatterns(converter, patterns); + // TODO(b/145824979) Remove linalg once sliceop is in std. + ::mlir::populateLinalgToLLVMConversionPatterns(converter, patterns, + &getContext()); + ::mlir::populateGpuToROCDLConversionPatterns(converter, patterns); + ::mlir::populateAffineToStdConversionPatterns(patterns, m.getContext()); + + ::mlir::ConversionTarget target(getContext()); + target.addIllegalDialect<::mlir::gpu::GPUDialect>(); + target + .addIllegalOp(); + target.addIllegalOp(); + target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); + target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>(); + // TODO(csigg): Remove once we support replacing non-root ops. + target.addLegalOp<::mlir::gpu::GPUModuleOp, ::mlir::gpu::ModuleEndOp, + ::mlir::gpu::YieldOp>(); + if (failed(mlir::applyFullConversion(m, target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +Status LowerKernelBodiesToROCDL(mlir::ModuleOp module) { + // We cannot verify as the signature of the kernel is rewritten. + ::mlir::PassManager pm(module.getContext(), /*verifyPasses=*/false); + applyPassManagerCLOptions(pm); + + auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, + /*out=*/llvm::dbgs()); + + // Rewrite kernel functions to LLVM IR. + auto& kernelPm = pm.nest<::mlir::gpu::GPUModuleOp>(); + kernelPm.addPass(::mlir::createLowerToCFGPass()); + kernelPm.addPass(absl::make_unique()); + + // Some basic cleanup. + kernelPm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + kernelPm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Remove all location information to prevent a debug build. 
+ kernelPm.addPass(::mlir::createStripDebugInfoPass()); + + if (failed(pm.run(module))) { + return InternalError("Lowering to ROCDL IR failed."); + } + return Status::OK(); +} + StatusOr ExtractKernelModule(mlir::ModuleOp module) { auto kernelModule = ::mlir::ModuleOp::create(module.getLoc()); // TODO(b/137624192): This also needs to resolve naming conflicts. @@ -595,5 +288,6 @@ StatusOr ExtractKernelModule(mlir::ModuleOp module) { }); return kernelModule; } + } // namespace mlir_gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index bd633bb06cb..290550142ec 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -36,6 +36,8 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, Status LowerKernelBodiesToNVVM(mlir::ModuleOp module); +Status LowerKernelBodiesToROCDL(mlir::ModuleOp module); + StatusOr ExtractKernelModule(mlir::ModuleOp module); } // namespace mlir_gpu diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index 194eb4618d3..b275dd4525f 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "llvm/IR/DataLayout.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -203,9 +204,13 @@ LhloDialectEmitter::LhloDialectEmitter( builder_(mlir_module_.getContext()), buffer_assignment_(assignment), platform_(platform) { - LLVMDialect* llvmDialect = - mlir_module.getContext()->getRegisteredDialect(); - pointer_size_ = llvmDialect->getLLVMModule().getDataLayout().getPointerSize(); + llvm::DataLayout data_layout(""); + if (auto data_layout_attr = mlir_module.getAttrOfType( + mlir::LLVM::LLVMDialect::getDataLayoutAttrName())) { + data_layout.reset(data_layout_attr.getValue()); + } + + pointer_size_ = data_layout.getPointerSize(); } void LhloDialectEmitter::AddThunkToThunkSequence(std::unique_ptr thunk) { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index 458522f89e6..26c9e155c0c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -25,23 +25,8 @@ limitations under the License. 
namespace xla { namespace mlir_gpu { -namespace { -using ::mlir::MLIRContext; -using ::mlir::LLVM::LLVMDialect; - -int64 ConfigureLLVMModuleAndGetPointerSize(MLIRContext* context) { - LLVMDialect* dialect = context->getRegisteredDialect(); - llvm::Module& module = dialect->getLLVMModule(); - module.setTargetTriple(gpu::nvptx::kTargetTriple); - module.setDataLayout(gpu::nvptx::kDataLayout); - return module.getDataLayout().getPointerSize(); -} - -} // namespace - -MlirCompiler::MlirCompiler() - : pointer_size_(ConfigureLLVMModuleAndGetPointerSize(&context_)) {} +MlirCompiler::MlirCompiler() : data_layout_("") {} se::Platform::Id MlirCompiler::PlatformId() const { return stream_executor::cuda::kCudaPlatformId; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h index a7b2f9446fa..261e249c0a1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ +#include "llvm/IR/DataLayout.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/xla/service/compiler.h" @@ -58,7 +59,7 @@ class MlirCompiler : public Compiler { protected: ::mlir::MLIRContext context_; - int64 pointer_size_; + llvm::DataLayout data_layout_; IRHook module_hook_; ErrorHandler error_handler_; }; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 2c2076bbd97..2e94c1a54f2 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "llvm/IR/LLVMContext.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project @@ -103,7 +104,7 @@ class MlirCompilerImpl : public MlirCompiler { const AotCompilationOptions& options) override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { - int64 pointer_size = pointer_size_; + int64 pointer_size = data_layout_.getPointerSize(); return [pointer_size](const Shape& shape) { return ShapeUtil::ByteSizeOf(shape, pointer_size); }; @@ -292,10 +293,10 @@ Status InsertBufferLoadPreduleIntoKernel( BufferAssignment* assignment, const std::vector& buffers) { mlir::OpBuilder builder(kernel.getBody()); - auto llvm_dialect = kernel.getContext()->getRegisteredDialect(); - auto offset_type = LLVMType::getInt64Ty(llvm_dialect); - auto ptr_type = LLVMType::getInt8PtrTy(llvm_dialect); - auto void_type = LLVMType::getVoidTy(llvm_dialect); + auto* context = kernel.getContext(); + auto offset_type = LLVMType::getInt64Ty(context); + auto ptr_type = LLVMType::getInt8PtrTy(context); + auto void_type = LLVMType::getVoidTy(context); auto loc = kernel.getLoc(); auto num_original_args = kernel.getNumArguments(); @@ -461,9 +462,9 @@ StatusOr> MlirCompilerImpl::RunBackend( // must also be used to determine the thunk launch schedule. 
std::unique_ptr stream_assignment = xla::gpu::AssignStreams(*module); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, + data_layout_.getPointerSize())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. @@ -543,7 +544,11 @@ StatusOr> MlirCompilerImpl::RunBackend( TF_RETURN_IF_ERROR( module_hook_.invoke(IRHook::LoweringStage::KERNEL, *kernel_module)); - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + // Translate to LLVM IR in a fresh context. The module is further translated + // to textual PTX and a CUBIN blob so there is no need for the context to live + // longer than this function. + llvm::LLVMContext llvmContext; + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module, llvmContext); if (!llvmModule) { return InternalError("Translation to LLVM failed"); @@ -575,7 +580,7 @@ StatusOr> MlirCompilerImpl::RunBackend( return {absl::make_unique( ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), emission_context.releaseHloModule(), std::move(buffer_assignment), - nullptr, nullptr)}; + nullptr, nullptr, std::vector())}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc new file mode 100644 index 00000000000..887f14e90d9 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc @@ -0,0 +1,423 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" + +#include "absl/memory/memory.h" +#include "llvm/ADT/SetVector.h" +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/Transforms/LoopUtils.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" + +namespace xla { +namespace mlir_gpu { +namespace { + +struct FusionOpRemoverPass + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([&](mlir::lmhlo::FusionOp op) { + mlir::OpBuilder builder(op); + // FusionOp has a single region with a single block, so we can just walk + // over it and clone operations to the outside. 
+ mlir::BlockAndValueMapping mapping; + for (auto& nested_op : op.region().front().without_terminator()) { + auto clone = builder.clone(nested_op, mapping); + for (auto pair : + llvm::zip(nested_op.getResults(), clone->getResults())) { + mapping.map(std::get<0>(pair), std::get<1>(pair)); + } + } + op.erase(); + }); + } +}; + +struct StoreForwardingPass + : mlir::PassWrapper { + mlir::StoreOp findStore(mlir::Operation* op, + std::function matches) { + // Search from op upwards in the current block. + mlir::Block* block = op->getBlock(); + auto startFromIt = + std::find_if(block->rbegin(), block->rend(), + [op](mlir::Operation& other) { return &other == op; }); + for (auto storeOpIt = startFromIt; storeOpIt != block->rend(); + ++storeOpIt) { + auto storeOp = llvm::dyn_cast(&*(storeOpIt)); + if (!storeOp || !matches(storeOp)) { + continue; + } + + return storeOp; + } + // No store operation found. Continue search outside of the parallel + // loop if block is in a parallel loop. + if (auto parallelOp = + llvm::dyn_cast(block->getParentOp())) { + return findStore(parallelOp.getOperation(), matches); + } + return {}; + } + + // Recursively search defining ops for AllocOp. Return either AllocOp if it is + // found or nullptr. + mlir::Operation* SearchAllocOp(mlir::Value memref) { + mlir::Operation* defOp = memref.getDefiningOp(); + while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { + defOp = subviewOp.source().getDefiningOp(); + } + if (auto allocOp = mlir::dyn_cast_or_null(defOp)) { + return allocOp.getOperation(); + } + return nullptr; + } + + // Retrieves AllocOp from the cache or actually looks for it. + mlir::Operation* GetAllocOp( + mlir::Value memref, + llvm::DenseMap* memrefToAllocOp) { + auto allocOpIt = memrefToAllocOp->find(memref); + if (allocOpIt != memrefToAllocOp->end()) { + return allocOpIt->second; + } + auto allocOp = SearchAllocOp(memref); + memrefToAllocOp->insert({memref, allocOp}); + return allocOp; + } + + void runOnFunction() override { + llvm::DenseMap memrefToAllocOp; + + getFunction().walk([&](mlir::LoadOp loadOp) { + auto storeOp = findStore(loadOp, [&](mlir::StoreOp storeOp) { + mlir::Operation* storeOpAlloc = + GetAllocOp(storeOp.memref(), &memrefToAllocOp); + mlir::Operation* loadOpAlloc = + GetAllocOp(loadOp.memref(), &memrefToAllocOp); + return storeOpAlloc && loadOpAlloc && (storeOpAlloc == loadOpAlloc); + }); + if (!storeOp) { + return; + } + auto storeIndices = storeOp.getIndices(); + auto loadIndices = loadOp.getIndices(); + if (!std::equal(storeIndices.begin(), storeIndices.end(), + loadIndices.begin(), loadIndices.end())) { + return; + } + loadOp.replaceAllUsesWith(storeOp.getValueToStore()); + loadOp.erase(); + }); + } +}; + +struct DeadTempBufferRemovalPass + : mlir::PassWrapper { + bool operationConsideredDead(mlir::Operation* op) { + for (auto result : op->getResults()) { + if (!llvm::all_of(result.getUsers(), [&](mlir::Operation* op) { + // Store and Dealloc is OK. + if (llvm::isa(op)) { + return true; + } + // Load without uses is also ok. + if (auto loadOp = llvm::dyn_cast(op)) { + return loadOp.use_empty(); + } + // Subview is ok if it is dead itself. 
+ if (llvm::isa(op)) { + return operationConsideredDead(op); + } + return false; + })) { + return false; + } + } + return true; + } + + void recursiveErase(mlir::Operation* op, + llvm::SmallVectorImpl* erase_list) { + for (auto result : op->getResults()) { + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + recursiveErase(user, erase_list); + } + } + erase_list->push_back(op); + } + + void runOnFunction() override { + llvm::SmallVector dead_ops; + getFunction().walk([&](mlir::AllocOp allocOp) { + if (!operationConsideredDead(allocOp)) { + return; + } + + // TODO(herhut): There should be a generic helper for this. + recursiveErase(allocOp, &dead_ops); + }); + for (auto op : dead_ops) { + op->erase(); + } + } +}; + +struct MoveScalarComputationsIntoGpuLaunchPass + : mlir::PassWrapper { + static bool isInliningBeneficiary(mlir::Operation* op) { + return llvm::isa(op); + } + + static bool extractBeneficiaryOps( + mlir::Operation* op, llvm::SmallVectorImpl* ops, + llvm::SetVector args) { + if (!isInliningBeneficiary(op)) { + return false; + } + + ops->push_back(op); + for (auto operand : op->getOperands()) { + // It is an existing arg, keep going. + if (args.count(operand)) { + continue; + } + mlir::Operation* definingOp = operand.getDefiningOp(); + if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { + return false; + } + } + return true; + } + + static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { + llvm::SetVector used_above; + mlir::getUsedValuesDefinedAbove(launch.body(), used_above); + mlir::BlockAndValueMapping inlined_map; + for (mlir::Value v : used_above) { + llvm::SmallVector ops_to_move; + mlir::Operation* definingOp = v.getDefiningOp(); + if (definingOp && + extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { + mlir::OpBuilder b(launch.body()); + for (mlir::Operation* op : llvm::reverse(ops_to_move)) { + auto result = b.clone(*op, inlined_map); + for (auto pair : llvm::zip(op->getResults(), result->getResults())) { + mlir::replaceAllUsesInRegionWith(std::get<0>(pair), + std::get<1>(pair), launch.body()); + } + inlined_map.map(op->getResults(), result->getResults()); + } + } + } + } + + void runOnFunction() override { + mlir::FuncOp fun = getFunction(); + fun.walk( + [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); + } +}; + +struct RewriteKernelSignaturePass + : mlir::PassWrapper { + void runOnFunction() override { + mlir::FuncOp func = getFunction(); + mlir::ModuleOp module = func.getParentOfType(); + getFunction().walk([&](mlir::gpu::LaunchFuncOp launchOp) { + mlir::gpu::GPUFuncOp kernel = + module.lookupSymbol(launchOp.kernel()); + + if (kernel.getNumFuncArguments() != + func.getNumArguments() + func.getNumResults()) { + kernel.emitError() + << "number of kernel arguments does not match number" + << "of arguments and results of surrounding function"; + signalPassFailure(); + return; + } + if (!llvm::hasSingleElement(func)) { + func.emitError() << "surrounding function has more than one block"; + signalPassFailure(); + return; + } + + // Compute a map from function arguments to kernel function operands. + mlir::BlockAndValueMapping func_to_kernel; + for (mlir::BlockArgument arg : func.getArguments()) { + for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { + if (launchOp.getKernelOperand(i) == arg) { + func_to_kernel.map(arg, kernel.getArgument(i)); + break; + } + } + } + // Also add function results that are computed by the launch. 
+ mlir::Operation* returnOp = func.getBody().back().getTerminator(); + for (mlir::Value result : returnOp->getOperands()) { + for (int i = 0, e = launchOp.getNumKernelOperands(); i < e; ++i) { + if (launchOp.getKernelOperand(i) == result) { + func_to_kernel.map(result, kernel.getArgument(i)); + break; + } + } + } + + // Create a new kernel function with modified signature. It will have the + // parameters and result types of the original function as its parameter + // type and otherwise will be void. + auto gpu_module = kernel.getParentOfType(); + mlir::OpBuilder kernel_builder(gpu_module.body()); + auto operand_types = llvm::to_vector<4>(llvm::concat( + func.getType().getInputs(), func.getType().getResults())); + auto new_kernel = kernel_builder.create( + kernel.getLoc(), kernel.getName(), + kernel_builder.getFunctionType(operand_types, {})); + new_kernel.setAttr(mlir::gpu::GPUDialect::getKernelFuncAttrName(), + kernel_builder.getUnitAttr()); + + // Create a map from old kernel argument to new one. + mlir::BlockAndValueMapping old_kernel_to_new; + for (int i = 0, e = func.getNumArguments(); i < e; ++i) { + mlir::Value func_arg = func.getArgument(i); + mlir::Value new_kernel_arg = new_kernel.getArgument(i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(func_arg); + if (!old_kernel_arg) { + kernel.emitOpError() + << "argument " << i + << " to containing function is not an argument to the kernel"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + for (int i = 0, e = returnOp->getNumOperands(); i < e; ++i) { + mlir::Value ret_op = returnOp->getOperand(i); + mlir::Value new_kernel_arg = + new_kernel.getArgument(func.getNumArguments() + i); + mlir::Value old_kernel_arg = func_to_kernel.lookupOrNull(ret_op); + if (!old_kernel_arg) { + kernel.emitOpError() + << "result " << i + << " of containing function is not an argument to the kernel"; + signalPassFailure(); + return; + } + old_kernel_to_new.map(old_kernel_arg, new_kernel_arg); + } + // Steal the body by appending the blocks and inserting a branch. + kernel.body().cloneInto(&new_kernel.getBody(), old_kernel_to_new); + kernel_builder.setInsertionPointToEnd(&new_kernel.body().front()); + kernel_builder.create( + new_kernel.getLoc(), &*std::next(new_kernel.body().begin())); + // Now create a new launchOp calling the new kernel. We need to forward + // the arguments of the surrounding function and operands to the return. + mlir::SmallVector new_operands; + new_operands.reserve(new_kernel.getNumFuncArguments()); + new_operands.append(func.args_begin(), func.args_end()); + new_operands.append(returnOp->operand_begin(), returnOp->operand_end()); + mlir::OpBuilder launch_builder(launchOp); + launch_builder.create( + launchOp.getLoc(), new_kernel, launchOp.getGridSizeOperandValues(), + launchOp.getBlockSizeOperandValues(), new_operands); + // Launch does not have results, so we can just erase it. And the kernel + // also needs to go.
+ launchOp.erase(); + kernel.erase(); + }); + } +}; + +struct MapParallelLoopsPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); + } +}; + +struct FuseInnerParallelLoopsPass + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](mlir::scf::ParallelOp op) { + mlir::scf::naivelyFuseParallelOps(op.region()); + }); + } +}; + +struct ParallelLoopCollapsingToFirstDimPass + : public mlir::PassWrapper> { + void runOnOperation() override { + mlir::Operation* module = getOperation(); + + module->walk([&](mlir::scf::ParallelOp op) { + unsigned num_loops = op.getNumLoops(); + std::vector combinedLoops; + combinedLoops.reserve(num_loops); + for (unsigned i = 0; i < num_loops; ++i) { + combinedLoops.push_back(i); + } + mlir::collapseParallelLoops(op, {combinedLoops}); + }); + } +}; + +} // namespace + +std::unique_ptr createFusionOpRemoverPass() { + return absl::make_unique(); +} + +std::unique_ptr createStoreForwardingPass() { + return absl::make_unique(); +} + +std::unique_ptr createDeadTempBufferRemovalPass() { + return absl::make_unique(); +} + +std::unique_ptr +createMoveScalarComputationsIntoGpuLaunchPass() { + return absl::make_unique(); +} + +std::unique_ptr createRewriteKernelSignaturePass() { + return absl::make_unique(); +} + +std::unique_ptr createFuseInnerParallelLoopsPass() { + return absl::make_unique(); +} + +std::unique_ptr createMapParallelLoopsPass() { + return absl::make_unique(); +} + +std::unique_ptr> +createParallelLoopCollapsingToFirstDimPass() { + return absl::make_unique(); +} + +} // namespace mlir_gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.h b/tensorflow/compiler/xla/service/mlir_gpu/passes.h new file mode 100644 index 00000000000..e3840628a2e --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.h @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace xla { +namespace mlir_gpu { + +// TODO(herhut, pifon): Move these passes to MLIR Core. + +/// Replaces a FusionOp by the operations contained in its region. +std::unique_ptr createFusionOpRemoverPass(); + +/// Replaces a load that immediately follows a store to the same address with +/// the stored value. This needs generalization. +std::unique_ptr createStoreForwardingPass(); + +/// Removes temporary buffers that are only written to but never read from or +/// that are read but the read value is not used. Needs an analysis that proves +/// that loads and stores are side-effect free (in bounds, no aliasing, etc.). +std::unique_ptr createDeadTempBufferRemovalPass(); + +/// Moves scalar computations to the GPULaunchOp body. 
+std::unique_ptr +createMoveScalarComputationsIntoGpuLaunchPass(); + +/// Sorts the operands to the kernel for a deterministic order. First operands +/// that are defined by function arguments, followed by operands that are +/// returned from the function. This only works for simple functions without +/// control flow and can be used in cases where the kernel is extracted and used +/// independently of the host-side code. +std::unique_ptr createRewriteKernelSignaturePass(); + +/// We need to direct fusion to the inner loops. This cannot be done with +/// a passmanager alone ATM, as nested pass managers require operations to +/// be closed from above. +std::unique_ptr createFuseInnerParallelLoopsPass(); + +/// Greedily maps loops to GPU hardware dimensions. +std::unique_ptr createMapParallelLoopsPass(); + +/// Collapses all loop dimension into the first one. +std::unique_ptr> +createParallelLoopCollapsingToFirstDimPass(); + +} // namespace mlir_gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_H_ diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index a21cec538d1..c5c2d081686 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -17,6 +17,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -338,6 +339,21 @@ bool MultiOutputFusion::LegalToFuseMainConstraints(HloInstruction* instr1, if (!ShapesCompatibleForFusion(instr1, instr2)) { return false; } + + // If both nodes are in-place operations and they use a common in-place + // operand, we can't fuse these two. + for (const auto& operand_and_output_index1 : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instr1)) { + const HloInstruction* operand = + instr1->operand(operand_and_output_index1.first.operand_number); + for (const auto& operand_and_output_index2 : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instr2)) { + if (operand == + instr2->operand(operand_and_output_index2.first.operand_number)) { + return false; + } + } + } return true; } diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index febbf9294b0..eb29fa89098 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -351,8 +351,7 @@ class AllOfPattern { // Returns a pattern that represents the conjunction of all input patterns. All // patterns need to match in order to have the AllOf pattern match. template -detail::AllOfPattern::type, Patterns...> AllOf( - const Patterns&... patterns) { +auto AllOf(const Patterns&... patterns) { return detail::AllOfPattern::type, Patterns...>(patterns...); } @@ -361,10 +360,8 @@ detail::AllOfPattern::type, Patterns...> AllOf( // // This transformation is necessary for good pretty-printing. template -detail::AllOfPattern::type, InnerPs..., - OuterPs...> -AllOf(const detail::AllOfPattern& inner_p, - const OuterPs&... outer_ps) { +auto AllOf(const detail::AllOfPattern& inner_p, + const OuterPs&... outer_ps) { // Invoke constructor of AllOfPattern. auto make_all_of = [](const InnerPs&... inner_ps, const OuterPs&... 
outer_ps) { @@ -453,10 +450,7 @@ template class LayoutPattern { private: template - auto AppendImpl(NewImpl new_impl) const - -> LayoutPattern(std::declval(), - std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_allof = AllOf<::xla::Layout>(impl_, std::move(new_impl)); return LayoutPattern(std::move(new_allof), matched_layout_); @@ -495,14 +489,12 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. - constexpr auto EqualTo(const ::xla::Layout* layout) const - -> decltype(this->AppendImpl(LayoutPatternEqualImpl(layout))) { + constexpr auto EqualTo(const ::xla::Layout* layout) const { return AppendImpl(LayoutPatternEqualImpl(layout)); } // Modifies the pattern to match only if the layout has a dense format. - constexpr auto WithDenseFormat() const - -> decltype(this->AppendImpl(LayoutPatternFormatImpl(DENSE))) { + constexpr auto WithDenseFormat() const { return AppendImpl(LayoutPatternFormatImpl(DENSE)); } @@ -626,17 +618,14 @@ class AnyOfPattern { // patterns. The returned pattern matches from left to right, and stops on the // first match. template -detail::AnyOfPattern::type, Patterns...> AnyOf( - const Patterns&... patterns) { +auto AnyOf(const Patterns&... patterns) { return detail::AnyOfPattern::type, Patterns...>(patterns...); } // Creates a layout pattern that will capture the matched layout in the // argument. -inline constexpr detail::LayoutPattern -Layout(const ::xla::Layout** matched_layout = nullptr) { +inline constexpr auto Layout(const ::xla::Layout** matched_layout = nullptr) { return detail::LayoutPattern( detail::LayoutPatternBaseImpl(), matched_layout); @@ -644,9 +633,7 @@ Layout(const ::xla::Layout** matched_layout = nullptr) { // Creates a layout pattern that will capture the matched layout in the // argument. -inline constexpr detail::LayoutPattern<::xla::Layout, - detail::LayoutPatternBaseImpl> -Layout(::xla::Layout** matched_layout) { +inline constexpr auto Layout(::xla::Layout** matched_layout) { return detail::LayoutPattern<::xla::Layout, detail::LayoutPatternBaseImpl>( detail::LayoutPatternBaseImpl(), matched_layout); } @@ -939,10 +926,7 @@ template class ShapePattern { private: template - auto AppendImpl(NewImpl new_impl) const - -> ShapePattern(std::declval(), - std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_all_of = AllOf<::xla::Shape>(impl_, std::move(new_impl)); return ShapePattern(std::move(new_all_of), matched_shape_); @@ -988,80 +972,66 @@ class ShapePattern { // Modifies the pattern to match only if the shape equals the given proto. // The layout must outlive the returned pattern. - constexpr auto EqualTo(const ::xla::Shape* shape) const - -> decltype(this->AppendImpl(ShapePatternEqualImpl(shape))) { + constexpr auto EqualTo(const ::xla::Shape* shape) const { return AppendImpl(ShapePatternEqualImpl(shape)); } // Modifies the pattern to match only if the shape is compatible to the given // proto. The layout must outlive the returned pattern. - constexpr auto CompatibleTo(const ::xla::Shape* shape) const - -> decltype(this->AppendImpl(ShapePatternCompatibleImpl(shape))) { + constexpr auto CompatibleTo(const ::xla::Shape* shape) const { return AppendImpl(ShapePatternCompatibleImpl(shape)); } // Modifies the pattern to match only if the shape has the given element type. 
- constexpr auto WithElementType(PrimitiveType element_type) const - -> decltype(this->AppendImpl(ShapePatternElementTypeImpl(element_type))) { + constexpr auto WithElementType(PrimitiveType element_type) const { return AppendImpl(ShapePatternElementTypeImpl(element_type)); } // Modifies the pattern to match only if the shape is scalar. - constexpr auto IsScalar() const - -> decltype(this->AppendImpl(ShapePatternIsScalarImpl())) { + constexpr auto IsScalar() const { return AppendImpl(ShapePatternIsScalarImpl()); } // Modifies the pattern to match only if the shape is an array. - constexpr auto IsArray() const - -> decltype(this->AppendImpl(ShapePatternIsArrayImpl())) { + constexpr auto IsArray() const { return AppendImpl(ShapePatternIsArrayImpl()); } // Modifies the pattern to match only if the shape is a tuple. - constexpr auto IsTuple() const - -> decltype(this->AppendImpl(ShapePatternIsTupleImpl())) { + constexpr auto IsTuple() const { return AppendImpl(ShapePatternIsTupleImpl()); } - constexpr auto IsEffectiveScalar() const - -> decltype(this->AppendImpl(ShapePatternEffectiveScalarImpl())) { + constexpr auto IsEffectiveScalar() const { return AppendImpl(ShapePatternEffectiveScalarImpl()); } // Modifies the pattern to match only if the shape has the given rank. - constexpr auto WithRank(int64 rank) const - -> decltype(this->AppendImpl(ShapePatternRankImpl(rank))) { + constexpr auto WithRank(int64 rank) const { return AppendImpl(ShapePatternRankImpl(rank)); } // Modifies the pattern to match only if the shape has a layout that matches // the given pattern. template - auto WithLayout(const LayoutPattern& layout) const - -> decltype(this->AppendImpl( - ShapePatternLayoutImpl(layout))) { + auto WithLayout(const LayoutPattern& layout) const { return AppendImpl(ShapePatternLayoutImpl(layout)); } - constexpr auto WithLayoutEqualTo(const ::xla::Layout* layout) const - -> decltype(this->WithLayout(Layout().EqualTo(layout))) { + constexpr auto WithLayoutEqualTo(const ::xla::Layout* layout) const { return WithLayout(Layout().EqualTo(layout)); } - constexpr auto IsDenseArray() const - -> decltype(this->WithLayout(Layout().WithDenseFormat())) { + constexpr auto IsDenseArray() const { return WithLayout(Layout().WithDenseFormat()); } // Modifies the pattern to match only if the shape has a subshape that matches // the given pattern. template - auto WithSubshape(ShapeIndexView index, - const ShapePattern& subshape) - const -> decltype(this->AppendImpl( - ShapePatternSubshapeImpl(index, - subshape))) { + auto WithSubshape( + ShapeIndexView index, + const ShapePattern& subshape) const { return AppendImpl( ShapePatternSubshapeImpl(index, subshape)); } @@ -1101,17 +1071,13 @@ class ShapePattern { } // namespace detail // Creates a shape pattern that will capture the matched layout in the argument. -inline constexpr detail::ShapePattern -Shape(const ::xla::Shape** matched_shape = nullptr) { +inline constexpr auto Shape(const ::xla::Shape** matched_shape = nullptr) { return detail::ShapePattern( detail::ShapePatternBaseImpl(), matched_shape); } // Creates a shape pattern that will capture the matched layout in the argument. 
-inline constexpr detail::ShapePattern<::xla::Shape, - detail::ShapePatternBaseImpl> -Shape(::xla::Shape** matched_shape) { +inline constexpr auto Shape(::xla::Shape** matched_shape) { return detail::ShapePattern<::xla::Shape, detail::ShapePatternBaseImpl>( detail::ShapePatternBaseImpl(), matched_shape); } @@ -1797,9 +1763,7 @@ template class HloInstructionPattern { private: template - auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern< - HloInstructionType, decltype(AllOf<::xla::HloInstruction>( - std::declval(), std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_allof = AllOf<::xla::HloInstruction>(impl_, std::move(new_impl)); return HloInstructionPattern( std::move(new_allof), matched_inst_); @@ -1837,51 +1801,38 @@ class HloInstructionPattern { } // Modifies the pattern to match only if the instruction has the given name. - auto WithName(absl::string_view name) const - -> decltype(this->AppendImpl(HloInstructionPatternNameImpl(name))) { + auto WithName(absl::string_view name) const { return AppendImpl(HloInstructionPatternNameImpl(name)); } // Modifies the pattern to match only if the instruction has the given opcode. - auto WithOpcode(HloOpcode opcode) const - -> decltype(this->AppendImpl(HloInstructionPatternOpcodeImpl(opcode, - false))) { + auto WithOpcode(HloOpcode opcode) const { return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, false)); } // Modifies the pattern to match only the custom call with a given target. - auto WithCustomCallTarget(absl::string_view custom_call_target) const - -> decltype(this->AppendImpl( - HloInstructionCustomCallTargetImpl(custom_call_target))) { + auto WithCustomCallTarget(absl::string_view custom_call_target) const { return AppendImpl(HloInstructionCustomCallTargetImpl(custom_call_target)); } - auto WithNumOperands(int64 num_operands) const -> decltype( - this->AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands))) { + auto WithNumOperands(int64 num_operands) const { return AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands)); } // Modifies the pattern to match only if the instruction does not have the // given opcode. - auto WithoutOpcode(HloOpcode opcode) const - -> decltype(this->AppendImpl(HloInstructionPatternOpcodeImpl(opcode, - true))) { + auto WithoutOpcode(HloOpcode opcode) const { return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, true)); } - constexpr auto Is(const HloInstruction* instr) const - -> decltype(this->AppendImpl(HloInstructionIsImpl(instr))) { + constexpr auto Is(const HloInstruction* instr) const { return AppendImpl(HloInstructionIsImpl(instr)); } // Modifies the pattern to match only if the instruction is a constant. - constexpr auto IsConstant() const - -> decltype(this->WithOpcode(HloOpcode::kConstant)) { - return WithOpcode(HloOpcode::kConstant); - } + constexpr auto IsConstant() const { return WithOpcode(HloOpcode::kConstant); } - constexpr auto IsConstantScalar() const -> decltype(this->AppendImpl( - HloConstantScalarImpl(/*match_effective_scalar=*/false))) { + constexpr auto IsConstantScalar() const { return AppendImpl( HloConstantScalarImpl(/*match_effective_scalar=*/false)); } @@ -1889,39 +1840,32 @@ class HloInstructionPattern { // This does not check that T has the same type as the instruction, so e.g. // IsConstantScalar(1.0) may match a constant of shape int32[]. 
template - constexpr auto IsConstantScalar(const ScalarTy& val) const - -> decltype(this->AppendImpl(HloConstantScalarImpl( - val, /*match_effective_scalar=*/false))) { + constexpr auto IsConstantScalar(const ScalarTy& val) const { return AppendImpl( HloConstantScalarImpl(val, /*match_effective_scalar=*/false)); } - constexpr auto IsConstantEffectiveScalar() const -> decltype(this->AppendImpl( - HloConstantScalarImpl(/*match_effective_scalar=*/true))) { + constexpr auto IsConstantEffectiveScalar() const { return AppendImpl( HloConstantScalarImpl(/*match_effective_scalar=*/true)); } template - constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const - -> decltype(this->AppendImpl(HloConstantScalarImpl( - val, /*match_effective_scalar=*/true))) { + constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const { return AppendImpl( HloConstantScalarImpl(val, /*match_effective_scalar=*/true)); } // Modifies the pattern to match only if the instruction is not a constant. - constexpr auto IsNonConstant() const - -> decltype(this->WithoutOpcode(HloOpcode::kConstant)) { + constexpr auto IsNonConstant() const { return WithoutOpcode(HloOpcode::kConstant); } // Modifies the pattern to match only if the instruction has a shape that // matches the given pattern. template - constexpr auto WithShape(const ShapePattern& shape) - const -> decltype(this->AppendImpl( - HloInstructionPatternShapeImpl(shape))) { + constexpr auto WithShape( + const ShapePattern& shape) const { return AppendImpl( HloInstructionPatternShapeImpl(shape)); } @@ -1929,16 +1873,14 @@ class HloInstructionPattern { // Make this a templated function to work around gcc 4.9.4 template infinite // recursion bug. template - constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const - -> decltype(this->WithShape(Shape().EqualTo(shape))) { + constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const { return WithShape(Shape().EqualTo(shape)); } // Make this a templated function to work around gcc 4.9.4 template infinite // recursion bug. template - constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const - -> decltype(this->WithShape(Shape().CompatibleTo(shape))) { + constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const { return WithShape(Shape().CompatibleTo(shape)); } @@ -1947,10 +1889,7 @@ class HloInstructionPattern { template constexpr auto WithOperand( int64 operand_index, - const HloInstructionPattern& operand) const - -> decltype(this->AppendImpl( - HloInstructionPatternOperandImpl( - operand_index, operand))) { + const HloInstructionPattern& operand) const { return AppendImpl( HloInstructionPatternOperandImpl( operand_index, operand)); @@ -1960,11 +1899,7 @@ class HloInstructionPattern { typename OperandImpl2> constexpr auto WithBinaryOperandsAnyOrder( const HloInstructionPattern& op1, - const HloInstructionPattern& op2) const - -> decltype(this->AppendImpl( - HloInstructionPatternBinaryOperandsAnyOrderImpl< - OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, - op2))) { + const HloInstructionPattern& op2) const { return AppendImpl( HloInstructionPatternBinaryOperandsAnyOrderImpl< OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, op2)); @@ -1972,46 +1907,39 @@ class HloInstructionPattern { // Modifies the pattern to match only if the instruction is a fusion node with // the given kind. 
- constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const - -> decltype(this->AppendImpl(HloInstructionPatternFusionKindImpl(kind))) { + constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const { return AppendImpl(HloInstructionPatternFusionKindImpl(kind)); } // Modifies the pattern to match only if the instruction is a // get-tuple-element with the given tuple index. - constexpr auto WithTupleIndex(int64 tuple_index) const -> decltype( - this->AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index))) { + constexpr auto WithTupleIndex(int64 tuple_index) const { return AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index)); } // Modifies the pattern to match only if the instruction is a parameter // with the given parameter number. - constexpr auto WithParameterNum(int64 parameter_num) const -> decltype( - this->AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num))) { + constexpr auto WithParameterNum(int64 parameter_num) const { return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num)); } // Modifies the pattern to match if the instruction is used exactly once. // Does not match if the instruction is used twice by the same user (e.g. // multiply(x,x)). - constexpr auto WithOneUse() const - -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) { + constexpr auto WithOneUse() const { return AppendImpl(HloInstructionPatternOneUseImpl()); } // Modifies the pattern to match if the instruction is used by exactly one // other instruction. Will match if the instruction is used twice, so long as // it's by the same user (e.g. multiply(x,x)). - constexpr auto WithOneUser() const - -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) { + constexpr auto WithOneUser() const { return AppendImpl(HloInstructionPatternOneUserImpl()); } // Modifies the pattern to match only if the instruction has the given // comparison direction. - auto WithComparisonDirection(ComparisonDirection direction) const - -> decltype(this->AppendImpl( - HloInstructionPatternComparisonDirectionImpl(direction))) { + auto WithComparisonDirection(ComparisonDirection direction) const { return AppendImpl(HloInstructionPatternComparisonDirectionImpl(direction)); } @@ -2028,9 +1956,7 @@ class HloInstructionPattern { // Creates an instruction pattern that will capture the matched instruction in // the argument. -inline constexpr detail::HloInstructionPattern< - const ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl> -Op(const ::xla::HloInstruction** matched_inst = nullptr) { +inline constexpr auto Op(const ::xla::HloInstruction** matched_inst = nullptr) { return detail::HloInstructionPattern( detail::HloInstructionPatternBaseImpl(), matched_inst); @@ -2038,24 +1964,19 @@ Op(const ::xla::HloInstruction** matched_inst = nullptr) { // Creates an instruction pattern that will capture the matched instruction in // the argument. -inline constexpr detail::HloInstructionPattern< - ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl> -Op(::xla::HloInstruction** matched_inst) { +inline constexpr auto Op(::xla::HloInstruction** matched_inst) { return detail::HloInstructionPattern<::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>( detail::HloInstructionPatternBaseImpl(), matched_inst); } // Helpers for nullary instructions. 
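The helper macros below, like the member functions in the hunks above, all apply the same mechanical simplification: the C++11-style `-> decltype(...)` trailing return types are dropped and the return type is left to C++14 return type deduction. A standalone illustration of the equivalence (not taken from the patch; the names are made up):

template <typename T>
auto TwiceOld(const T& x) -> decltype(x + x) {  // C++11: restate the type.
  return x + x;
}

template <typename T>
auto TwiceNew(const T& x) {  // C++14: deduced from the return statement.
  return x + x;
}

One difference between the two forms in general is that the decltype spelling participates in SFINAE while plain deduction turns a failure into a hard error; that presumably does not matter for these matcher factories, which are not used as overload candidates.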
-#define XLA_NULLOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst) \ - ->decltype(Op(matched_inst).WithOpcode(HloOpcode::k##NAME)) { \ - return Op(matched_inst).WithOpcode(HloOpcode::k##NAME); \ +#define XLA_NULLOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst) { \ + return Op(matched_inst).WithOpcode(HloOpcode::k##NAME); \ } XLA_NULLOP_PATTERN(Constant) XLA_NULLOP_PATTERN(Parameter) @@ -2064,28 +1985,21 @@ XLA_NULLOP_PATTERN(Rng) #undef XLA_NULLOP_PATTERN // Helpers for unary instructions. -#define XLA_UNOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Arg&& arg)->decltype( \ - Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg))) { \ - return Op() \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg)); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Arg&& arg) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg)); \ +#define XLA_UNOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(Arg&& arg) { \ + return Op() \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(arg)); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Arg&& arg) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(arg)); \ } XLA_UNOP_PATTERN(Abs) XLA_UNOP_PATTERN(RoundNearestAfz) @@ -2124,55 +2038,40 @@ XLA_UNOP_PATTERN(Transpose) #undef XLA_UNOP_PATTERN // Helpers for binary instructions. 
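For orientation, the factories generated by these macros (and the hand-written ones above) are normally composed and consumed through `Match()`. A minimal sketch, where `instr` stands for some `HloInstruction*` under inspection and is not defined in this header:

namespace m = match;

HloInstruction* x = nullptr;
if (Match(instr, m::AddAnyOrder(m::Op(&x), m::ConstantScalar()))) {
  // `instr` is an add with a scalar-constant operand on either side; `x` is
  // bound to whichever operand m::Op(&x) matched.
}

`AddAnyOrder` comes from XLA_COMMUTATIVE_BINOP_PATTERN(Add) below; this change only affects how the return types of these helpers are spelled, not how callers use them.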
-#define XLA_BINOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs))) { \ - return Op() \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)); \ +#define XLA_BINOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(Lhs&& lhs, Rhs&& rhs) { \ + return Op() \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)); \ } -#define XLA_COMMUTATIVE_BINOP_PATTERN(NAME) \ - XLA_BINOP_PATTERN(NAME) \ - \ - template \ - inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ - Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs)); \ - } \ - template \ - inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs))) { \ - return NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs)); \ +#define XLA_COMMUTATIVE_BINOP_PATTERN(NAME) \ + XLA_BINOP_PATTERN(NAME) \ + \ + template \ + inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ + Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs)); \ + } \ + template \ + inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) { \ + return NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs)); \ } XLA_COMMUTATIVE_BINOP_PATTERN(Add) XLA_BINOP_PATTERN(Atan2) @@ -2202,16 +2101,10 @@ XLA_BINOP_PATTERN(ShiftRightLogical) // Helpers for ternary instructions. 
#define XLA_TERNOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ \ template \ - inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2) \ - ->decltype(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg0)) \ - .WithOperand(1, std::forward(arg1)) \ - .WithOperand(2, std::forward(arg2))) { \ + inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2) { \ return Op() \ .WithOpcode(HloOpcode::k##NAME) \ .WithOperand(0, std::forward(arg0)) \ @@ -2222,12 +2115,7 @@ XLA_BINOP_PATTERN(ShiftRightLogical) template \ inline auto NAME(HloInstructionType** matched_inst, Arg0&& arg0, \ - Arg1&& arg1, Arg2&& arg2) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg0)) \ - .WithOperand(1, std::forward(arg1)) \ - .WithOperand(2, std::forward(arg2))) { \ + Arg1&& arg1, Arg2&& arg2) { \ return Op(matched_inst) \ .WithOpcode(HloOpcode::k##NAME) \ .WithOperand(0, std::forward(arg0)) \ @@ -2241,17 +2129,13 @@ XLA_TERNOP_PATTERN(Select); namespace detail { template -inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg) - -> decltype(m.WithOperand(operand_num, std::forward(first_arg))) { +inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg) { return m.WithOperand(operand_num, std::forward(first_arg)); } template inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg, - Args&&... args) - -> decltype(WithOperands(m.WithOperand(operand_num, - std::forward(first_arg)), - operand_num + 1, std::forward(args)...)) { + Args&&... args) { return WithOperands( m.WithOperand(operand_num, std::forward(first_arg)), operand_num + 1, std::forward(args)...); @@ -2259,26 +2143,17 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg, } // namespace detail #define XLA_VARIADIC_OP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ \ template \ - inline auto NAME(Args&&... args) \ - ->decltype(detail::WithOperands(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithNumOperands(sizeof...(Args)), \ - 0, std::forward(args)...)) { \ + inline auto NAME(Args&&... args) { \ return detail::WithOperands( \ Op().WithOpcode(HloOpcode::k##NAME).WithNumOperands(sizeof...(Args)), \ /*operand_num=*/0, std::forward(args)...); \ } \ \ template \ - inline auto NAME(HloInstructionType** matched_inst, Args&&... args) \ - ->decltype(detail::WithOperands(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithNumOperands(sizeof...(Args)), \ - 0, std::forward(args)...)) { \ + inline auto NAME(HloInstructionType** matched_inst, Args&&... args) { \ return detail::WithOperands(Op(matched_inst) \ .WithOpcode(HloOpcode::k##NAME) \ .WithNumOperands(sizeof...(Args)), \ @@ -2299,63 +2174,46 @@ XLA_VARIADIC_OP_PATTERN(Sort); XLA_VARIADIC_OP_PATTERN(Tuple); // Helpers for comparison instructions. 
-#define XLA_COMPARE_PATTERN(NAME) \ - inline auto NAME()->decltype( \ - Op().WithOpcode(HloOpcode::kCompare) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op() \ - .WithOpcode(HloOpcode::kCompare) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op().WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op() \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ +#define XLA_COMPARE_PATTERN(NAME) \ + inline auto NAME() { \ + return Op() \ + .WithOpcode(HloOpcode::kCompare) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ + } \ + \ + template \ + inline auto NAME(Lhs&& lhs, Rhs&& rhs) { \ + return Op() \ + .WithOpcode(HloOpcode::kCompare) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::kCompare) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ } -#define XLA_COMMUTATIVE_COMPARE_PATTERN(NAME) \ - XLA_COMPARE_PATTERN(NAME) \ - \ - template \ - inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ - Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs)); \ - } \ - template \ - inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs))) { \ - return NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs)); \ +#define XLA_COMMUTATIVE_COMPARE_PATTERN(NAME) \ + XLA_COMPARE_PATTERN(NAME) \ + \ + template \ + inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ + Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::kCompare) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs)); \ + } \ + template \ + inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) { \ + return NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs)); \ } XLA_COMMUTATIVE_COMPARE_PATTERN(Eq); @@ -2366,23 +2224,17 @@ XLA_COMPARE_PATTERN(Le); XLA_COMPARE_PATTERN(Lt); // Helpers for matching non-constant instructions. 
-inline auto NonConstant() -> decltype(Op().IsNonConstant()) { - return Op().IsNonConstant(); -} +inline auto NonConstant() { return Op().IsNonConstant(); } template -inline auto NonConstant(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsNonConstant()) { +inline auto NonConstant(HloInstructionType** matched_inst) { return Op(matched_inst).IsNonConstant(); } // Add overloads for GetTupleElement which take a int64 specifying which tuple // element is selected. template -inline auto GetTupleElement(Arg&& arg, int64 tuple_index) - -> decltype(Op().WithOpcode(HloOpcode::kGetTupleElement) - .WithOperand(0, std::forward(arg)) - .WithTupleIndex(tuple_index)) { +inline auto GetTupleElement(Arg&& arg, int64 tuple_index) { return Op() .WithOpcode(HloOpcode::kGetTupleElement) .WithOperand(0, std::forward(arg)) @@ -2391,11 +2243,7 @@ inline auto GetTupleElement(Arg&& arg, int64 tuple_index) template inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, - int64 tuple_index) - -> decltype(Op(matched_inst) - .WithOpcode(HloOpcode::kGetTupleElement) - .WithOperand(0, std::forward(arg)) - .WithTupleIndex(tuple_index)) { + int64 tuple_index) { return Op(matched_inst) .WithOpcode(HloOpcode::kGetTupleElement) .WithOperand(0, std::forward(arg)) @@ -2404,62 +2252,50 @@ inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, // Add overloads for Parameter which take an int64 specifying the parameter // number. -inline auto Parameter(int64 parameter_num) -> decltype( - Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num)) { +inline auto Parameter(int64 parameter_num) { return Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num); } template -inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num) - -> decltype(Op(matched_inst) - .WithOpcode(HloOpcode::kParameter) - .WithParameterNum(parameter_num)) { +inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num) { return Op(matched_inst) .WithOpcode(HloOpcode::kParameter) .WithParameterNum(parameter_num); } -inline auto ConstantScalar() -> decltype(Op().IsConstantScalar()) { - return Op().IsConstantScalar(); -} +inline auto ConstantScalar() { return Op().IsConstantScalar(); } template -inline auto ConstantScalar(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsConstantScalar()) { +inline auto ConstantScalar(HloInstructionType** matched_inst) { return Op(matched_inst).IsConstantScalar(); } template -inline auto ConstantScalar(ScalarTy val) - -> decltype(Op().IsConstantScalar(val)) { +inline auto ConstantScalar(ScalarTy val) { return Op().IsConstantScalar(val); } template -inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val) - -> decltype(Op(matched_inst).IsConstantScalar(val)) { +inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val) { return Op(matched_inst).IsConstantScalar(val); } -inline auto ConstantEffectiveScalar() -> decltype(Op().IsConstantScalar()) { +inline auto ConstantEffectiveScalar() { return Op().IsConstantEffectiveScalar(); } template -inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsConstantScalar()) { +inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst) { return Op(matched_inst).IsConstantEffectiveScalar(); } template -inline auto ConstantEffectiveScalar(ScalarTy val) - -> decltype(Op().IsConstantEffectiveScalar(val)) { +inline auto ConstantEffectiveScalar(ScalarTy val) { return 
Op().IsConstantEffectiveScalar(val); } template inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst, - ScalarTy val) - -> decltype(Op(matched_inst).IsConstantEffectiveScalar(val)) { + ScalarTy val) { return Op(matched_inst).IsConstantEffectiveScalar(val); } diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc index e3a3feb8640..bd99f920ea0 100644 --- a/tensorflow/compiler/xla/service/scatter_expander.cc +++ b/tensorflow/compiler/xla/service/scatter_expander.cc @@ -325,6 +325,22 @@ static StatusOr> ScatterLoopBody( {updated_operand, scatter_indices, updates}}; } +static int64 ScatterTripCount(HloInstruction* scatter) { + // Compute the trip count for the while loop to be used for scatter. This + // should be the number of indices we should scatter into the operand. + HloInstruction* scatter_indices = scatter->mutable_operand(1); + const Shape& scatter_indices_shape = scatter_indices->shape(); + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + int64 scatter_loop_trip_count = 1; + for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { + if (i != dim_numbers.index_vector_dim()) { + scatter_loop_trip_count *= scatter_indices_shape.dimensions(i); + } + } + return scatter_loop_trip_count; +} + // High Level Algorithm. // // 1. Canonicalize the scatter_indices tensor such that it has rank 2, where @@ -342,7 +358,7 @@ static StatusOr> ScatterLoopBody( // from c. and d. using the update_computation of scatter. // f. Write the updated value of the slice into the operand tensor. -StatusOr ScatterExpander::ExpandScatter( +StatusOr ScatterExpander::ExpandInstruction( HloInstruction* scatter) { HloInstruction* operand = scatter->mutable_operand(0); HloInstruction* scatter_indices = scatter->mutable_operand(1); @@ -358,13 +374,7 @@ StatusOr ScatterExpander::ExpandScatter( // Compute the trip count for the while loop to be used for scatter. This // should be the number of indices we should scatter into the operand. 
- const Shape& scatter_indices_shape = scatter_indices->shape(); - int64 scatter_loop_trip_count = 1; - for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { - if (i != dim_numbers.index_vector_dim()) { - scatter_loop_trip_count *= scatter_indices_shape.dimensions(i); - } - } + int64 scatter_loop_trip_count = ScatterTripCount(scatter); if (!IsInt32(scatter_loop_trip_count)) { return Unimplemented( "Scatter operations with more than 2147483647 scatter indices are not " @@ -408,23 +418,9 @@ StatusOr ScatterExpander::ExpandScatter( return scatter_loop_result.front(); } -StatusOr ScatterExpander::Run(HloModule* module) { - std::vector scatter_instrs; - for (HloComputation* computation : module->MakeNonfusionComputations()) { - for (HloInstruction* instr : computation->instructions()) { - if (instr->opcode() == HloOpcode::kScatter) { - scatter_instrs.push_back(instr); - } - } - } - - for (auto instr : scatter_instrs) { - TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(instr)); - TF_RETURN_IF_ERROR( - instr->parent()->ReplaceInstruction(instr, expanded_root)); - } - - return !scatter_instrs.empty(); +bool ScatterExpander::InstructionMatchesPattern(HloInstruction* inst) { + return inst->opcode() == HloOpcode::kScatter && + (mode_ == kEliminateAllScatters || ScatterTripCount(inst) == 1); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/scatter_expander.h b/tensorflow/compiler/xla/service/scatter_expander.h index 533af060bc9..aa59e7ec3b0 100644 --- a/tensorflow/compiler/xla/service/scatter_expander.h +++ b/tensorflow/compiler/xla/service/scatter_expander.h @@ -16,17 +16,43 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { -class ScatterExpander : public HloModulePass { +// This pass rewrites scatter operations into (roughly) while loops of +// dynamic-update-slices. +// +// This pass can be used in two ways: +// +// - kEliminateAllScatters: For backends that don't support scatter, this pass +// can convert every scatter into a loop. +// +// - kEliminateSimpleScatters: For backends that *do* support scatter, this +// pass can strength-reduce "simple" scatters -- specifically, scatters that +// can be represented without a loop -- to dynamic-update-slices. +// +// Note that even in kEliminateSimpleScatters mode, this pass may still expand a +// scatter into a loop (with a trip-count of 1). It's up to other +// simplification passes to remove the loop. 
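Given the two modes, a hedged sketch of how a backend might register the pass; the pipeline name and placement are illustrative, not taken from this change:

HloPassPipeline pipeline("scatter-expansion");
// Backend without native scatter support: rewrite every scatter into a loop.
pipeline.AddPass<ScatterExpander>(ScatterExpander::kEliminateAllScatters);
// Backend with a scatter emitter: only strength-reduce trivial scatters
// (trip count 1), as selected by InstructionMatchesPattern() below.
// pipeline.AddPass<ScatterExpander>(ScatterExpander::kEliminateSimpleScatters);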
+class ScatterExpander : public OpExpanderPass { public: + enum Mode { + kEliminateAllScatters, + kEliminateSimpleScatters, + }; + + explicit ScatterExpander(Mode m) : mode_(m) {} + absl::string_view name() const override { return "scatter_expander"; } - StatusOr Run(HloModule* module) override; protected: - StatusOr ExpandScatter(HloInstruction* scatter); + bool InstructionMatchesPattern(HloInstruction* inst) override; + + StatusOr ExpandInstruction(HloInstruction* scatter) override; + + private: + Mode mode_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/scatter_expander_test.cc b/tensorflow/compiler/xla/service/scatter_expander_test.cc index 3852b82c1ef..9f4cc5406d8 100644 --- a/tensorflow/compiler/xla/service/scatter_expander_test.cc +++ b/tensorflow/compiler/xla/service/scatter_expander_test.cc @@ -57,11 +57,79 @@ TEST_F(ScatterExpanderTest, ScatterOperandWithoutLayout) { ParseAndReturnVerifiedModule(kModuleStr)); // The HLO parser changes all no layout shapes from the input to have a - // default layout, clear the layout of the scatter operand for testing. + // default layout. Clear the layout of the scatter operand for testing. HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); scatter_operand->mutable_shape()->clear_layout(); - ScatterExpander scatter_expander; + ScatterExpander scatter_expander(ScatterExpander::kEliminateAllScatters); + TF_ASSERT_OK_AND_ASSIGN(bool result, + RunHloPass(&scatter_expander, module.get())); + EXPECT_TRUE(result); +} + +TEST_F(ScatterExpanderTest, EliminateSimpleScattersSkipsNontrivialScatter) { + const char* kModuleStr = R"( + HloModule scatter_expander + + scatter_computation { + parameter0 = s32[] parameter(0) + ROOT parameter1 = s32[] parameter(1) + } + + ENTRY kernel_entry { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=scatter_computation, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + })"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + + // The HLO parser changes all no layout shapes from the input to have a + // default layout. Clear the layout of the scatter operand for testing. + HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); + scatter_operand->mutable_shape()->clear_layout(); + + ScatterExpander scatter_expander(ScatterExpander::kEliminateSimpleScatters); + TF_ASSERT_OK_AND_ASSIGN(bool result, + RunHloPass(&scatter_expander, module.get())); + EXPECT_FALSE(result); +} + +TEST_F(ScatterExpanderTest, EliminateSimpleScattersRewritesTrivialScatter) { + const char* kModuleStr = R"( + HloModule scatter_expander + + scatter_computation { + parameter0 = s32[] parameter(0) + ROOT parameter1 = s32[] parameter(1) + } + + ENTRY kernel_entry { + operand = s32[5] iota(), iota_dimension=0 + indices = s32[1] parameter(0) + update = s32[] constant(0) + ROOT scatter = s32[5]{0} scatter(operand, indices, update), + update_window_dims={}, inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, index_vector_dim=0, + to_apply=scatter_computation + })"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + + // The HLO parser changes all no layout shapes from the input to have a + // default layout. Clear the layout of the scatter operand for testing. 
+ HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); + scatter_operand->mutable_shape()->clear_layout(); + + ScatterExpander scatter_expander(ScatterExpander::kEliminateSimpleScatters); TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&scatter_expander, module.get())); EXPECT_TRUE(result); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 8e39e32e4c3..a96c9c34260 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -2825,6 +2825,38 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return output_shape; } +/* static */ StatusOr ShapeInference::InferDynamicReshapeShape( + const Shape& operand, absl::Span dim_size_shapes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + if (new_size_bounds.size() != dims_are_dynamic.size()) { + return InvalidArgument( + "DynamicReshape has to have the same number of elements in new_sizes " + "(%d) and dims_are_dynamic (%d)", + new_size_bounds.size(), dims_are_dynamic.size()); + } + + for (const Shape* dim_size_shape : dim_size_shapes) { + if (dim_size_shape->element_type() != S32 && dim_size_shape->rank() != 0) { + return InvalidArgument( + "DynamicReshape's dim size has to be scalar S32, got (%s): ", + dim_size_shape->ToString()); + } + } + + Shape inferred_shape = ShapeUtil::MakeShape( + operand.element_type(), new_size_bounds, dims_are_dynamic); + if (ShapeUtil::ElementsIn(operand) != ShapeUtil::ElementsIn(inferred_shape)) { + return InvalidArgument( + "Reshape operation has mismatched element counts: from=%d (%s) " + "to=%d (%s).", + ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand), + ShapeUtil::ElementsIn(inferred_shape), + ShapeUtil::HumanString(inferred_shape)); + } + return inferred_shape; +} + /* static */ StatusOr ShapeInference::InferReshapeShape( const Shape& operand, absl::Span dimensions, absl::Span new_sizes, int64 inferred_dimension) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index d47d96ab52d..f03e4e5fa98 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -241,6 +241,15 @@ class ShapeInference { absl::Span new_sizes, int64 inferred_dimension); + // Infers the shape produced by a dynamic reshape operation from the element + // type of its operand and the new dimension sizes specified. The result shape + // will have dynamic dimensions as specific in `dim_is_dynamic` and bound + // `new_size_bounds`. + static StatusOr InferDynamicReshapeShape( + const Shape& operand, absl::Span dim_size_shapes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + // Infers the shape produced by a transpose operation from the element type of // its operand and its dimensions field. static StatusOr InferTransposeShape( diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc index 5d85fb5189c..6524973a08e 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -91,9 +91,7 @@ bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs) { return is_better; } if (!rhs.IsTileMaximal()) { - // If we already have a non-tile-maximal sharding then we can't improve - // that. 
- return false; + return lhs.NumTiles() > rhs.NumTiles(); } else if (!rhs.IsReplicated()) { // If we are not replicated then only tiled (not tile maximal) shardings // can improve us. @@ -122,22 +120,158 @@ HloSharding MergeForMoreSpecificSharding(const HloSharding& a, return IsShardingMoreSpecific(a, b) ? a : b; } +// Tries to refine `to_merge` by combining with `old`. Returns if the final +// `to_merge` is more specific than `old`. May combine partial sharding in +// addition to MergeForMoreSpecificSharding(). +bool MergeSharding(const HloSharding& old, HloSharding* to_merge, + bool may_combine_partial_sharding) { + if (old.IsTuple()) { + CHECK(to_merge->IsTuple()); + bool changed = false; + for (int64 i = 0; i < old.tuple_elements().size(); ++i) { + changed |= + MergeSharding(old.tuple_elements()[i], &to_merge->tuple_elements()[i], + may_combine_partial_sharding); + } + return changed; + } + if (!may_combine_partial_sharding || !old.ReplicateOnLastTileDim() || + !to_merge->ReplicateOnLastTileDim() || + old.tile_assignment().num_elements() != + to_merge->tile_assignment().num_elements()) { + return IsShardingMoreSpecific(*to_merge, old); + } + // Combine the tile dimension sizes from new and old. + int64 num_devices = old.tile_assignment().num_elements(); + std::vector new_tile_dims; + bool compatible = true; + new_tile_dims.reserve(to_merge->tile_assignment().num_dimensions()); + for (int64 i = 0; i < to_merge->tile_assignment().num_dimensions() - 1; ++i) { + int64 new_dim = to_merge->tile_assignment().dim(i); + int64 old_dim = old.tile_assignment().dim(i); + if (new_dim == 1) { + new_tile_dims.push_back(old_dim); + } else if (old_dim == 1) { + new_tile_dims.push_back(new_dim); + } else if (new_dim == old_dim) { + new_tile_dims.push_back(new_dim); + } else { + compatible = false; + break; + } + } + int64 replication = num_devices / Product(new_tile_dims); + if (!compatible || num_devices % Product(new_tile_dims) != 0 || + replication >= old.tile_assignment().dimensions().back()) { + return IsShardingMoreSpecific(*to_merge, old); + } + new_tile_dims.push_back(replication); + Array new_tile(new_tile_dims); + // Maps from replication group ID to sorted members. + absl::flat_hash_map> old_group_members; + absl::flat_hash_map> new_group_members; + auto get_group_index = [&](absl::Span tile_indices, + const HloSharding& sharding) { + int64 group_id = 0; + for (int64 i = 0; i < tile_indices.size() - 1; ++i) { + group_id *= to_merge->tile_assignment().dim(i); + group_id += tile_indices[i]; + } + return group_id; + }; + old.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + old_group_members[get_group_index(indices, old)].insert(device); + }); + to_merge->tile_assignment().Each( + [&](absl::Span indices, int64 device) { + new_group_members[get_group_index(indices, *to_merge)].insert(device); + }); + // Try to find the intersection of old and new replication groups, in + // order to determine the merged tile assignment. 
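For concreteness, a hand-worked example of that intersection (not part of the patch): over four devices, merging an existing sharding {devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} (dimension 0 split between groups {0,1} and {2,3}, each group replicated) with a new sharding {devices=[1,2,2]0,2,1,3 last_tile_dim_replicate} (dimension 1 split between groups {0,2} and {1,3}) is compatible: at every step the smallest remaining member of the corresponding old and new groups coincides, so the loop below fills the merged tile as {devices=[2,2]0,1,2,3}, and since the leftover replication factor is 1 the result is emitted as a plain tiled sharding rather than a partial one.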
+ new_tile.Each([&](absl::Span indices, int64* device) { + if (!compatible) { + return; + } + std::vector old_index(indices.begin(), indices.end()); + std::vector new_index = old_index; + for (int64 i = 0; i < indices.size() - 1; ++i) { + if (old.tile_assignment().dim(i) == 1) { + old_index[i] = 0; + } + if (to_merge->tile_assignment().dim(i) == 1) { + new_index[i] = 0; + } + } + int64 old_group_id = get_group_index(old_index, old); + int64 new_group_id = get_group_index(new_index, *to_merge); + if (old_group_members[old_group_id].empty() || + new_group_members[new_group_id].empty() || + *old_group_members[old_group_id].begin() != + *new_group_members[new_group_id].begin()) { + compatible = false; + return; + } + *device = *old_group_members[old_group_id].begin(); + old_group_members[old_group_id].erase(*device); + new_group_members[new_group_id].erase(*device); + }); + if (compatible) { + if (replication == 1) { + new_tile_dims.pop_back(); + new_tile.Reshape(new_tile_dims); + *to_merge = HloSharding::Tile(new_tile); + } else { + *to_merge = HloSharding::PartialTile(new_tile); + } + return true; + } + return IsShardingMoreSpecific(*to_merge, old); +} + // Updates the sharding of the specified instruction with the specified sharding // if it is better than the current one and returns true if a new sharding have -// been applied. -bool MaybeImproveInstructionSharding(const HloSharding& sharding, - HloInstruction* instruction) { +// been applied. If may_combine_partial_sharding is true, this may combine the +// new and existing sharding if they are both partial tiling partial +// replication. +bool MaybeImproveInstructionSharding(HloSharding sharding, + HloInstruction* instruction, + bool may_combine_partial_sharding) { // We don't want to propagate tile maximal shardings. if (!IsSpatiallyPartitioned(sharding)) { return false; } // Any sharding is better then no sharding. if (!instruction->has_sharding()) { - instruction->set_sharding(sharding); + instruction->set_sharding(std::move(sharding)); return true; } - if (IsShardingMoreSpecific(sharding, instruction->sharding())) { - instruction->set_sharding(sharding); + int64 sharding_tiles = sharding.NumTiles(); + if (MergeSharding(instruction->sharding(), &sharding, + may_combine_partial_sharding)) { + // Override existing tiled sharding only when the new sharding is compatible + // with the existing one. This avoids unexpected resharding when `sharding` + // just has more tiles than existing sharding but they are not mergeable. 
+ if (instruction->shape().IsArray() && + !instruction->sharding().IsTileMaximal() && + sharding.NumTiles() == sharding_tiles) { + std::vector diff_dims; + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + if (instruction->sharding().tile_assignment().dim(i) == + sharding.tile_assignment().dim(i)) { + continue; + } + if (instruction->sharding().tile_assignment().dim(i) != 1) { + return false; + } + diff_dims.push_back(i); + } + if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + sharding, diff_dims) != instruction->sharding()) { + return false; + } + } + instruction->set_sharding(std::move(sharding)); return true; } return false; @@ -277,6 +411,7 @@ const HloInstruction* PickRepresentativeOperand( case HloOpcode::kDot: case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kDynamicReshape: case HloOpcode::kFft: case HloOpcode::kFusion: case HloOpcode::kGather: @@ -361,12 +496,114 @@ bool SupportSpatialPartitioning(const HloInstruction* instruction, } } +bool InferDotShardingFromOperands( + HloInstruction* instruction, + const dot_as_convolution_util::DotConvolutionDimsInfo& dnums, + bool may_combine_partial_sharding) { + auto from_operand = [&](int64 operand_index) { + auto operand = instruction->operand(operand_index); + const HloSharding& operand_sharding = operand->sharding(); + if (operand_sharding.IsTileMaximal()) { + return operand_sharding; + } + std::vector contracting_dims; + contracting_dims.reserve(dnums.contracting_dims.size()); + for (const auto& dim : dnums.contracting_dims) { + contracting_dims.push_back(operand_index == 0 ? dim.lhs : dim.rhs); + } + // It's possible that some size-1 spatial dims of convolutions are parsed as + // non-contracting dims. We might have tiled dimensions on them. + for (const auto& dim : operand_index == 0 + ? dnums.rhs_non_contracting_dims + : dnums.lhs_non_contracting_dims) { + int64 d = operand_index == 0 ? dim.lhs : dim.rhs; + if (d > 0) { + contracting_dims.push_back(d); + } + } + auto replicate_contracting_dims = + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + operand_sharding, contracting_dims); + std::vector out_dims_to_op_perm(instruction->shape().rank(), -1); + std::vector op_dims_to_output_perm(operand->shape().rank(), -1); + for (const auto& dim : dnums.batch_dims) { + out_dims_to_op_perm[dim.output] = operand_index == 0 ? dim.lhs : dim.rhs; + op_dims_to_output_perm[operand_index == 0 ? dim.lhs : dim.rhs] = + dim.output; + } + for (const auto& dim : operand_index == 0 + ? dnums.lhs_non_contracting_dims + : dnums.rhs_non_contracting_dims) { + out_dims_to_op_perm[dim.output] = operand_index == 0 ? dim.lhs : dim.rhs; + op_dims_to_output_perm[operand_index == 0 ? dim.lhs : dim.rhs] = + dim.output; + } + return *hlo_sharding_util::TransposeShardingWithCollapsedDims( + replicate_contracting_dims, op_dims_to_output_perm, + out_dims_to_op_perm); + }; + bool changed = false; + int64 larger_operand = + ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()) >= + ShapeUtil::ByteSizeOf(instruction->operand(1)->shape()) + ? 
0 + : 1; + if (IsSpatiallyPartitioned(instruction->operand(larger_operand))) { + changed |= MaybeImproveInstructionSharding(from_operand(larger_operand), + instruction, + may_combine_partial_sharding); + } + if (IsSpatiallyPartitioned(instruction->operand(1 - larger_operand))) { + changed |= MaybeImproveInstructionSharding(from_operand(1 - larger_operand), + instruction, + may_combine_partial_sharding); + } + return changed; +} + // Convolution handling for InferShardingFromOperands(). bool InferConvolutionShardingFromOperands(HloInstruction* instruction, - bool aggressive_prop) { + int64 aggressiveness, + bool may_combine_partial_sharding) { + auto get_partitions_for_dims = + [&](const HloInstruction* inst, + absl::Span< + const dot_as_convolution_util::DotConvolutionDimsInfo::DimNums> + dims, + int lhs_or_rhs) { + int64 partitions = 1; + if (!inst->has_sharding()) { + return partitions; + } + const auto& sharding = inst->sharding(); + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_or_rhs == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else { + CHECK_EQ(lhs_or_rhs, 1); + partitions *= sharding.tile_assignment().dim(dim.rhs); + } + } + return partitions; + }; + auto dot_dims = + dot_as_convolution_util::ParseConvolutionDimsInfo(instruction); + const int64 lhs_conv_spatial_partitions = get_partitions_for_dims( + instruction->operand(0), dot_dims.conv_spatial_dims, 0); + const int64 rhs_conv_spatial_partitions = get_partitions_for_dims( + instruction->operand(1), dot_dims.conv_spatial_dims, 1); + if (dot_dims.conv_spatial_dims.empty() || + (lhs_conv_spatial_partitions == 1 && rhs_conv_spatial_partitions == 1 && + instruction->batch_group_count() == 1 && + instruction->feature_group_count() == 1)) { + return InferDotShardingFromOperands(instruction, dot_dims, + may_combine_partial_sharding); + } const auto& dnums = instruction->convolution_dimension_numbers(); const HloInstruction* lhs = instruction->operand(0); - const HloInstruction* rhs = instruction->operand(1); auto get_tiled_sharding_based_on_lhs = [&] { CHECK(!lhs->sharding().IsTileMaximal()); std::vector output_to_lhs_indices(instruction->shape().rank()); @@ -381,103 +618,12 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, return hlo_sharding_util::TransposeSharding(lhs->sharding(), output_to_lhs_indices); }; - auto get_tiled_sharding_based_on_rhs = [&] { - CHECK(!rhs->sharding().IsTileMaximal()); - std::vector output_to_rhs_indices(instruction->shape().rank()); - output_to_rhs_indices[dnums.output_batch_dimension()] = - dnums.kernel_input_feature_dimension(); - output_to_rhs_indices[dnums.output_feature_dimension()] = - dnums.kernel_output_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - output_to_rhs_indices[dnums.output_spatial_dimensions(i)] = - dnums.kernel_spatial_dimensions(i); - } - return hlo_sharding_util::TransposeSharding(rhs->sharding(), - output_to_rhs_indices); - }; - if (auto dot_dims = dot_as_convolution_util::ParseDotGeneralFromConvolution( - instruction)) { - // lhs_or_rhs: lhs is 0 and rhs is 1. Skips dimensions with size 1. 
- auto partitioned_only_along_non_trivial_dims = - [&](const HloSharding& sharding, - std::vector& dims, - int64 lhs_or_rhs) { - if (sharding.IsTileMaximal()) { - return false; - } - int64 partition_count = 1; - for (const auto& dim : dims) { - if (lhs_or_rhs == 0) { - if (lhs->shape().dimensions(dim.lhs) == 1) { - continue; - } - partition_count *= sharding.tile_assignment().dim(dim.lhs); - } else { - if (rhs->shape().dimensions(dim.rhs) == 1) { - continue; - } - CHECK_EQ(lhs_or_rhs, 1); - partition_count *= sharding.tile_assignment().dim(dim.rhs); - } - } - return partition_count == sharding.tile_assignment().num_elements(); - }; - // If LHS/RHS is partitioned only along the batch dimensions, propagate - // the sharding to the output, since batch dimensions are the easiest to - // partition. - if (IsSpatiallyPartitioned(lhs) && - partitioned_only_along_non_trivial_dims(lhs->sharding(), - dot_dims->batch_dims, 0)) { - return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_lhs(), - instruction); - } - if (IsSpatiallyPartitioned(rhs) && - partitioned_only_along_non_trivial_dims(rhs->sharding(), - dot_dims->batch_dims, 1)) { - return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_rhs(), - instruction); - } - if (aggressive_prop) { - // If LHS/RHS is partitioned only along the non-contracting - // dimensions, propagate the sharding to the output. - const bool can_propagate_from_lhs = - IsSpatiallyPartitioned(lhs) && - partitioned_only_along_non_trivial_dims( - lhs->sharding(), dot_dims->lhs_non_contracting_dims, 0); - const bool can_propagate_from_rhs = - IsSpatiallyPartitioned(rhs) && - partitioned_only_along_non_trivial_dims( - rhs->sharding(), dot_dims->rhs_non_contracting_dims, 1); - // If we can propagate from both operands, choose the larger one which - // should help us reduce communications. - if (can_propagate_from_lhs && can_propagate_from_rhs) { - if (Product(lhs->shape().dimensions()) >= - Product(rhs->shape().dimensions())) { - return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_lhs(), instruction); - } else { - return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_rhs(), instruction); - } - } - if (can_propagate_from_lhs) { - return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_lhs(), instruction); - } - if (can_propagate_from_rhs) { - return MaybeImproveInstructionSharding( - get_tiled_sharding_based_on_rhs(), instruction); - } - } - } - if (!IsSpatiallyPartitioned(lhs)) { return false; } if (lhs->sharding().IsReplicated()) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, may_combine_partial_sharding); } if (IsConvolutionKernelSmall(instruction)) { @@ -488,11 +634,28 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, return false; } return MaybeImproveInstructionSharding(get_tiled_sharding_based_on_lhs(), - instruction); + instruction, + may_combine_partial_sharding); } // If the kernel is large (e.g backward convolution) then we only support // replicated output. - return MaybeImproveInstructionSharding(HloSharding::Replicate(), instruction); + return MaybeImproveInstructionSharding(HloSharding::Replicate(), instruction, + may_combine_partial_sharding); +} + +bool CanPropagateThroughAtAgressiveLevel(const HloInstruction& inst, + int64 aggressiveness) { + // At minimum agressiveness, only allow pass-through ops. 
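In effect, `aggressiveness` is a threshold: at 0, sharding only flows through elementwise ops, transposes, and reshapes; under SPMD, values above 0 additionally enable `may_combine_partial_sharding`; and forward propagation through broadcast (the `kBroadcast` case further down) waits until level 3, to avoid resharding immediately after a broadcast.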
+ if (aggressiveness < 1 && !inst.IsElementwise() && + inst.opcode() != HloOpcode::kTranspose && + inst.opcode() != HloOpcode::kReshape) { + return false; + } + // Broadcast propagation should have at least aggressiveness 2. + if (aggressiveness < 2 && inst.opcode() == HloOpcode::kBroadcast) { + return false; + } + return true; } // Tries to update the sharding of the specified instruction based on its @@ -500,7 +663,11 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, // changed and false otherwise. bool InferShardingFromOperands(HloInstruction* instruction, const ComputationMap& computation_map, - bool is_spmd, bool aggressive_prop) { + bool is_spmd, int64 aggressiveness) { + if (!CanPropagateThroughAtAgressiveLevel(*instruction, aggressiveness)) { + return false; + } + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { // If an array shaped HLO doesn't support spatial partitioning but at least // one of its operand is replicated then we make the HLO replicated as well. @@ -512,8 +679,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (absl::c_any_of(instruction->operands(), [](const HloInstruction* op) { return op->has_sharding() && op->sharding().IsReplicated(); })) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + return MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, may_combine_partial_sharding); } return false; } @@ -526,7 +693,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, } HloSharding new_sharding = operand->sharding().GetSubSharding( operand->shape(), {instruction->tuple_index()}); - return MaybeImproveInstructionSharding(new_sharding, instruction); + return MaybeImproveInstructionSharding( + std::move(new_sharding), instruction, may_combine_partial_sharding); } case HloOpcode::kTuple: { if (absl::c_none_of(instruction->operands(), @@ -591,60 +759,60 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (!IsSpatiallyPartitioned(operand)) { continue; } - auto get_maybe_tuple_sharding = [&](const HloSharding& sharding) { + auto get_maybe_tuple_sharding = [&](HloSharding sharding) { if (instruction->operand_count() == 2) { return sharding; } std::vector tuple(instruction->operand_count() / 2, - sharding); + std::move(sharding)); return HloSharding::Tuple(instruction->shape(), tuple); }; - if (operand->sharding().IsReplicated()) { + if (operand->sharding().IsReplicated() || + (!is_spmd && + absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { + return operand->sharding().tile_assignment().dim(dim) > 1; + }))) { + // We are reducing along one of the sharded dimensions. We only + // support this in SPMD. changed |= MaybeImproveInstructionSharding( - get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, + may_combine_partial_sharding); continue; } - if (absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { - return operand->sharding().tile_assignment().dim(dim) > 1; - })) { - // We are reducing along one of the sharded dimensions. We don't - // support tiled sharding in this case. + auto after_partial_replication = + operand->sharding().IsReplicated() + ? 
operand->sharding() + : hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + operand->sharding(), instruction->dimensions()); + if (after_partial_replication.IsReplicated()) { changed |= MaybeImproveInstructionSharding( - get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); - } else { - // We are reducing along some of the non-sharded dimensions. The - // result sharding should be the same as the operand sharding with the - // reduction dimensions removed as they are removed from the result - // shape. - std::vector target_tile_assignment_dimensions; - const auto& dimensions = instruction->dimensions(); - for (int64 i = 0; i < operand->shape().rank(); ++i) { - if (absl::c_find(dimensions, i) == dimensions.end()) { - target_tile_assignment_dimensions.push_back( - operand->sharding().tile_assignment().dim(i)); - } - } - Array new_tile_assignment = - operand->sharding().tile_assignment(); - new_tile_assignment.Reshape(target_tile_assignment_dimensions); - // Use the same sharding for all tuple elements, because they are part - // of the same reduce instruction. - HloSharding new_sharding = - get_maybe_tuple_sharding(HloSharding::Tile(new_tile_assignment)); - changed |= MaybeImproveInstructionSharding(new_sharding, instruction); + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, + may_combine_partial_sharding); + continue; } + // Use the same sharding for all tuple elements, because they are part + // of the same reduce instruction. + HloSharding new_sharding = + get_maybe_tuple_sharding(hlo_sharding_util::RemoveShapeDimensions( + after_partial_replication, instruction->dimensions())); + changed |= MaybeImproveInstructionSharding( + std::move(new_sharding), instruction, may_combine_partial_sharding); } return changed; } case HloOpcode::kBroadcast: { - const HloInstruction* op = instruction->operand(0); - if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { + // Make forward propagation through broadcast low priority to avoid + // resharding after broadcast. + if (aggressiveness < 3) { return false; } - // Heuristic: If an operand is more than 8 times fewer elements than its - // output, do not propagate sharding. - if (ShapeUtil::ElementsIn(instruction->shape()) > - 8 * ShapeUtil::ElementsIn(op->shape())) { + // Do not override existing tile sharding. This is likely from users. + if (IsSpatiallyPartitioned(instruction) && + !instruction->sharding().IsTileMaximal()) { + return false; + } + const HloInstruction* op = instruction->operand(0); + if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { return false; } // The output will be tiled along the broadcasted dimension the same way @@ -662,13 +830,22 @@ bool InferShardingFromOperands(HloInstruction* instruction, op->sharding().tile_assignment().dim(source_dim)); } } + if (op->sharding().ReplicateOnLastTileDim()) { + target_tile_assignment_dimensions.push_back( + op->sharding().tile_assignment().dimensions().back()); + } Array new_tile_assignment = op->sharding().tile_assignment(); new_tile_assignment.Reshape(target_tile_assignment_dimensions); - HloSharding new_sharding = HloSharding::Tile(new_tile_assignment); - return MaybeImproveInstructionSharding(new_sharding, instruction); + HloSharding new_sharding = + op->sharding().ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); + return MaybeImproveInstructionSharding( + std::move(new_sharding), instruction, may_combine_partial_sharding); } case HloOpcode::kConvolution: - return InferConvolutionShardingFromOperands(instruction, aggressive_prop); + return InferConvolutionShardingFromOperands(instruction, aggressiveness, + may_combine_partial_sharding); case HloOpcode::kTranspose: { const HloInstruction* input = instruction->operand(0); if (!IsSpatiallyPartitioned(input)) { @@ -676,7 +853,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, } HloSharding sharding = hlo_sharding_util::TransposeSharding( input->sharding(), instruction->dimensions()); - return MaybeImproveInstructionSharding(sharding, instruction); + return MaybeImproveInstructionSharding(std::move(sharding), instruction, + may_combine_partial_sharding); } case HloOpcode::kReduceWindow: { const HloInstruction* lhs = instruction->operand(0); @@ -694,7 +872,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + return MaybeImproveInstructionSharding(lhs->sharding(), instruction, + may_combine_partial_sharding); } case HloOpcode::kSelectAndScatter: { // Shard according to first operand, as output keeps the same shape. @@ -713,7 +892,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + return MaybeImproveInstructionSharding(lhs->sharding(), instruction, + may_combine_partial_sharding); } case HloOpcode::kReshape: { if (!IsSpatiallyPartitioned(instruction->operand(0))) { @@ -724,8 +904,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, instruction->operand(0)->shape(), instruction->shape(), instruction->operand(0)->sharding()); if (new_sharding.has_value()) { - return MaybeImproveInstructionSharding(new_sharding.value(), - instruction); + return MaybeImproveInstructionSharding(std::move(*new_sharding), + instruction, + may_combine_partial_sharding); } return false; } @@ -736,83 +917,13 @@ bool InferShardingFromOperands(HloInstruction* instruction, return MaybeImproveInstructionSharding( hlo_sharding_util::ReverseSharding( instruction->operand(0)->sharding(), instruction->dimensions()), - instruction); + instruction, may_combine_partial_sharding); } case HloOpcode::kDot: { - auto& dot_dim_numbs = instruction->dot_dimension_numbers(); - // Batch dimensions are the same for lhs and rhs on dot operations. - int64 num_batch_dims = dot_dim_numbs.lhs_batch_dimensions_size(); - std::vector contracting_dims(2); - contracting_dims[0] = dot_dim_numbs.lhs_contracting_dimensions(0); - contracting_dims[1] = dot_dim_numbs.rhs_contracting_dimensions(0); - std::vector ops_sharding(2, nullptr); - for (int64 op_num = 0; op_num < 2; ++op_num) { - const HloInstruction* op = instruction->operand(op_num); - if (IsSpatiallyPartitioned(op)) { - ops_sharding[op_num] = &op->sharding(); - } - } - if (ops_sharding[0] == nullptr && ops_sharding[1] == nullptr) { - return false; - } - - // Select representative operand. 
- int64 representative_op = -1; - if (ops_sharding[0] == nullptr) { - representative_op = 1; - } else if (ops_sharding[1] == nullptr) { - representative_op = 0; - } else if (ops_sharding[0]->IsReplicated() && - ops_sharding[1]->IsReplicated()) { - // Both replicated -> replicate - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); - } else if (!ops_sharding[0]->IsReplicated() && - !ops_sharding[1]->IsReplicated()) { - // Both tile sharded. The dot spatial partitioning implementation - // replicates the operand corresponding to the non-tiled dimension: - // dot(lhs, rhs), sharding={devices=[1, ..., n, 1]} replicates rhs - // dot(lhs, rhs), sharding={devices=[1, ..., 1, n]} replicates lhs - // so set sharding in order to replicate the smaller of lhs and rhs - representative_op = - ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()) < - ShapeUtil::ByteSizeOf(instruction->operand(1)->shape()) - ? 1 - : 0; - } else { - // One is replicated and the other is tiled - pick the tiled one. - representative_op = ops_sharding[0]->IsReplicated() ? 1 : 0; - } - - if (ops_sharding[representative_op]->IsReplicated()) { - return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); - } else { - // Tile-shard instruction according to representative op. - auto sharding = *ops_sharding[representative_op]; - if (instruction->shape().dimensions_size() != - sharding.tile_assignment().num_dimensions()) { - // It is necessarily the case of a matrix x vector, with - // representative_op being the matrix, because the vector op has the - // same shape as instruction. - CHECK_EQ(sharding.tile_assignment().num_dimensions(), - instruction->shape().dimensions_size() + 1); - // Reshape sharding so that last dimension is 1, and then remove - // last dimension. 
- std::vector non_batch_dims( - sharding.tile_assignment().num_dimensions() - num_batch_dims); - absl::c_iota(non_batch_dims, num_batch_dims); - sharding = hlo_sharding_util::ReshapeToTileDimension( - sharding, num_batch_dims, non_batch_dims); - auto tile_assignment = sharding.tile_assignment(); - auto dimensions = tile_assignment.dimensions(); - CHECK_EQ(dimensions.back(), 1); - dimensions.pop_back(); - tile_assignment.Reshape(dimensions); - sharding = HloSharding::Tile(tile_assignment); - } - return MaybeImproveInstructionSharding(sharding, instruction); - } + const auto& dnums = + dot_as_convolution_util::ParseDotGeneralFromDot(instruction); + return InferDotShardingFromOperands(instruction, dnums, + may_combine_partial_sharding); } case HloOpcode::kParameter: { auto parent_it = computation_map.find(instruction->parent()); @@ -826,7 +937,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (parent->called_computations()[i - 1] == instruction->parent()) { if (parent->operand(i)->has_sharding()) { return MaybeImproveInstructionSharding( - parent->operand(i)->sharding(), instruction); + parent->operand(i)->sharding(), instruction, + may_combine_partial_sharding); } return false; } @@ -853,15 +965,15 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (instruction->shape().IsTuple()) { return MaybeImproveInstructionSharding( HloSharding::SingleTuple(instruction->shape(), operand->sharding()), - instruction); + instruction, may_combine_partial_sharding); } else { - return MaybeImproveInstructionSharding(operand->sharding(), - instruction); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); } } case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: { - auto propagate_slicing = [instruction]() { + auto propagate_slicing = [&]() { const HloInstruction* operand = instruction->opcode() == HloOpcode::kDynamicSlice ? 
instruction->operand(0) @@ -872,7 +984,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (operand->sharding().IsReplicated()) { return MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + instruction, + may_combine_partial_sharding); } const auto& tile_assignment = operand->sharding().tile_assignment(); @@ -883,10 +996,10 @@ bool InferShardingFromOperands(HloInstruction* instruction, return false; } } - return MaybeImproveInstructionSharding(operand->sharding(), - instruction); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); }; - auto propagate_base = [instruction]() { + auto propagate_base = [&]() { if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) { return false; } @@ -894,7 +1007,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, return false; } return MaybeImproveInstructionSharding( - instruction->operand(0)->sharding(), instruction); + instruction->operand(0)->sharding(), instruction, + may_combine_partial_sharding); }; return propagate_slicing() || propagate_base(); } @@ -903,15 +1017,17 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (IsSpatiallyPartitioned(instruction->operand(1))) { HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( instruction->operand(1)->sharding(), instruction); - changed |= MaybeImproveInstructionSharding(new_sharding, instruction); + changed |= MaybeImproveInstructionSharding( + std::move(new_sharding), instruction, may_combine_partial_sharding); } if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { auto maybe_from_data = hlo_sharding_util::GatherOutputShardingFromDataOperand( instruction->operand(0)->sharding(), *instruction); if (maybe_from_data) { - changed |= - MaybeImproveInstructionSharding(*maybe_from_data, instruction); + changed |= MaybeImproveInstructionSharding( + std::move(*maybe_from_data), instruction, + may_combine_partial_sharding); } } return changed; @@ -920,7 +1036,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, bool changed = false; if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { changed |= MaybeImproveInstructionSharding( - instruction->operand(0)->sharding(), instruction); + instruction->operand(0)->sharding(), instruction, + may_combine_partial_sharding); } if (!IsSpatiallyPartitioned(instruction->operand(1)) && !IsSpatiallyPartitioned(instruction->operand(2))) { @@ -931,12 +1048,13 @@ bool InferShardingFromOperands(HloInstruction* instruction, hlo_sharding_util::ScatterOutputShardingFromUpdate( instruction->operand(2)->sharding(), *instruction); if (maybe_from_update) { - changed |= - MaybeImproveInstructionSharding(*maybe_from_update, instruction); + changed |= MaybeImproveInstructionSharding( + std::move(*maybe_from_update), instruction, + may_combine_partial_sharding); } } - changed |= MaybeImproveInstructionSharding(HloSharding::Replicate(), - instruction); + changed |= MaybeImproveInstructionSharding( + HloSharding::Replicate(), instruction, may_combine_partial_sharding); return changed; } case HloOpcode::kWhile: { @@ -948,50 +1066,143 @@ bool InferShardingFromOperands(HloInstruction* instruction, sharding = MergeForMoreSpecificSharding(sharding, instruction->sharding()); } - return MaybeImproveInstructionSharding(sharding, instruction); + return MaybeImproveInstructionSharding(std::move(sharding), instruction, + may_combine_partial_sharding); } default: { + if (instruction->IsElementwise() && may_combine_partial_sharding) 
{ + bool changed = false; + for (auto operand : instruction->operands()) { + if (IsSpatiallyPartitioned(operand)) { + changed |= MaybeImproveInstructionSharding( + operand->sharding(), instruction, may_combine_partial_sharding); + } + } + return changed; + } const HloInstruction* operand = PickRepresentativeOperand(instruction); if (!operand || !IsSpatiallyPartitioned(operand)) { return false; } - return MaybeImproveInstructionSharding(operand->sharding(), instruction); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); } } return false; } +HloSharding InferDotOperandSharding( + const HloInstruction* instruction, + const dot_as_convolution_util::DotConvolutionDimsInfo& dnums, + int64 operand_index, bool may_combine_partial_sharding) { + auto operand = instruction->operand(operand_index); + auto other = instruction->operand(1 - operand_index); + std::vector output_dims_to_replicate; + std::vector other_operand_dims_to_replicate; + for (const auto& dim : operand_index == 0 ? dnums.rhs_non_contracting_dims + : dnums.lhs_non_contracting_dims) { + output_dims_to_replicate.push_back(dim.output); + other_operand_dims_to_replicate.push_back(operand_index == 0 ? dim.rhs + : dim.lhs); + } + // If this dot is interpreted from a conv, then contracting dims may have + // corresponding spatial dimensions in the output, and this operand's + // non-contracting dims may have corresponding spatial dims in the other + // operand. + for (const auto& dim : dnums.contracting_dims) { + if (dim.output >= 0) { + output_dims_to_replicate.push_back(dim.output); + } + } + for (const auto& dim : operand_index == 0 ? dnums.lhs_non_contracting_dims + : dnums.rhs_non_contracting_dims) { + int64 other_dim = operand_index == 0 ? dim.rhs : dim.lhs; + if (other_dim >= 0) { + other_operand_dims_to_replicate.push_back(other_dim); + } + } + auto output_other_dims_replicated = + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + instruction->sharding(), output_dims_to_replicate); + std::vector output_to_operand_dims(instruction->shape().rank(), -1); + std::vector operand_to_output_dims(operand->shape().rank(), -1); + for (const auto& dim : dnums.batch_dims) { + output_to_operand_dims[dim.output] = operand_index == 0 ? dim.lhs : dim.rhs; + operand_to_output_dims[operand_index == 0 ? dim.lhs : dim.rhs] = dim.output; + } + for (const auto& dim : operand_index == 0 ? dnums.lhs_non_contracting_dims + : dnums.rhs_non_contracting_dims) { + output_to_operand_dims[dim.output] = operand_index == 0 ? dim.lhs : dim.rhs; + operand_to_output_dims[operand_index == 0 ? dim.lhs : dim.rhs] = dim.output; + } + auto sharding = *hlo_sharding_util::TransposeShardingWithCollapsedDims( + output_other_dims_replicated, output_to_operand_dims, + operand_to_output_dims); + if (IsSpatiallyPartitioned(other)) { + auto other_operand_dims_replicated = + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + other->sharding(), other_operand_dims_to_replicate); + std::vector other_to_operand_dims(other->shape().rank(), -1); + std::vector operand_to_other_dims(operand->shape().rank(), -1); + for (const auto& dim : dnums.batch_dims) { + other_to_operand_dims[operand_index == 0 ? dim.rhs : dim.lhs] = + operand_index == 0 ? dim.lhs : dim.rhs; + operand_to_other_dims[operand_index == 0 ? dim.lhs : dim.rhs] = + operand_index == 0 ? dim.rhs : dim.lhs; + } + for (const auto& dim : dnums.contracting_dims) { + other_to_operand_dims[operand_index == 0 ? dim.rhs : dim.lhs] = + operand_index == 0 ? 
dim.lhs : dim.rhs; + operand_to_other_dims[operand_index == 0 ? dim.lhs : dim.rhs] = + operand_index == 0 ? dim.rhs : dim.lhs; + } + HloSharding sharding_from_other = + *hlo_sharding_util::TransposeShardingWithCollapsedDims( + other_operand_dims_replicated, other_to_operand_dims, + operand_to_other_dims); + if (MergeSharding(sharding, &sharding_from_other, + may_combine_partial_sharding)) { + sharding = std::move(sharding_from_other); + } + } + return sharding; +} + // Return the sharding that should be propagated from user to instruction. absl::optional GetShardingFromUser( const HloInstruction& instruction, const HloInstruction& user, - bool aggressive_prop, bool is_spmd) { + int64 aggressiveness, bool is_spmd) { + if (!CanPropagateThroughAtAgressiveLevel(user, aggressiveness)) { + return absl::nullopt; + } if (!IsSpatiallyPartitioned(&user)) { return absl::nullopt; } + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; switch (user.opcode()) { case HloOpcode::kBroadcast: { if (user.sharding().IsReplicated()) { return user.sharding(); } - // Only support when none of the partitioned dimensions in the broadcast - // output belong to new dimensions. + std::vector dims_to_replicate; + bool needs_replication = false; for (int64 i = 0; i < user.shape().rank(); ++i) { - if (user.sharding().tile_assignment().dim(i) > 1 && - absl::c_count(user.dimensions(), i) == 0) { - return absl::nullopt; + if (absl::c_count(user.dimensions(), i) == 0) { + dims_to_replicate.push_back(i); + if (user.sharding().tile_assignment().dim(i) > 1) { + needs_replication = true; + } } } - - // The instruction (operand of broadcast) will be tiled the same way - // as the output. - std::vector target_tile_assignment_dimensions; - for (int64 output_dim : user.dimensions()) { - target_tile_assignment_dimensions.push_back( - user.sharding().tile_assignment().dim(output_dim)); + // If not SPMD, only support when none of the partitioned dimensions in + // the broadcast output belong to new dimensions. + if (!is_spmd && needs_replication) { + return absl::nullopt; } - Array new_tile_assignment = user.sharding().tile_assignment(); - new_tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(new_tile_assignment); + return hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + user.sharding(), dims_to_replicate), + dims_to_replicate); } case HloOpcode::kConcatenate: { if (user.sharding().IsReplicated()) { @@ -1036,64 +1247,11 @@ absl::optional GetShardingFromUser( return HloSharding::Tile(new_tile_assignment); } case HloOpcode::kConvolution: { - if (auto dot_dims = - dot_as_convolution_util::ParseDotGeneralFromConvolution(&user)) { - const auto& dnums = user.convolution_dimension_numbers(); - auto partitioned_only_along_non_trivial_dims = - [&](const HloSharding& sharding, - std::vector& - dims) { - if (sharding.IsTileMaximal()) { - return false; - } - int64 partition_count = 1; - for (const auto& dim : dims) { - if (user.shape().dimensions(dim.output) == 1) { - continue; - } - partition_count *= sharding.tile_assignment().dim(dim.output); - } - return partition_count == - sharding.tile_assignment().num_elements(); - }; - // If output is partitioned only along the batch dimensions, or only - // along the non-contracting dimensions, propagate the sharding to the - // operand. 
- if (&instruction == user.operand(0) && - (partitioned_only_along_non_trivial_dims(user.sharding(), - dot_dims->batch_dims) || - partitioned_only_along_non_trivial_dims( - user.sharding(), dot_dims->lhs_non_contracting_dims))) { - std::vector lhs_to_output_indices(user.shape().rank()); - lhs_to_output_indices[dnums.input_batch_dimension()] = - dnums.output_batch_dimension(); - lhs_to_output_indices[dnums.input_feature_dimension()] = - dnums.output_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = - dnums.output_spatial_dimensions(i); - } - return hlo_sharding_util::TransposeSharding(user.sharding(), - lhs_to_output_indices); - } - if (&instruction == user.operand(1) && - (partitioned_only_along_non_trivial_dims(user.sharding(), - dot_dims->batch_dims) || - partitioned_only_along_non_trivial_dims( - user.sharding(), dot_dims->rhs_non_contracting_dims))) { - std::vector rhs_to_output_indices(user.shape().rank()); - rhs_to_output_indices[dnums.kernel_input_feature_dimension()] = - dnums.output_batch_dimension(); - rhs_to_output_indices[dnums.kernel_output_feature_dimension()] = - dnums.output_feature_dimension(); - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - rhs_to_output_indices[dnums.kernel_spatial_dimensions(i)] = - dnums.output_spatial_dimensions(i); - } - return hlo_sharding_util::TransposeSharding(user.sharding(), - rhs_to_output_indices); - } + auto dot_dims = dot_as_convolution_util::ParseConvolutionDimsInfo(&user); + if (dot_dims.conv_spatial_dims.empty()) { + int64 op_idx = user.operand_index(&instruction); + return InferDotOperandSharding(&user, dot_dims, op_idx, + may_combine_partial_sharding); } return absl::nullopt; } @@ -1175,33 +1333,10 @@ absl::optional GetShardingFromUser( return new_sharding; } case HloOpcode::kDot: { - if (user.sharding().IsReplicated()) { - return user.sharding(); - } - auto& dim_numbers = user.dot_dimension_numbers(); int64 op_idx = user.operand_index(&instruction); - // Batch dimensions are the same on lhs and rhs for dot operations. - int64 num_batch_dims = dim_numbers.lhs_batch_dimensions_size(); - int64 num_spatial_dims = - instruction.shape().dimensions_size() - num_batch_dims; - if (num_spatial_dims == 1) { - // This is the vector of a matrix x vector operation -> replicate, - // since tiling on the vector would necessarily be on the contracting - // dimension, which we don't support. - CHECK_EQ(op_idx, 1); - return HloSharding::Replicate(); - } - // Instruction is necessarily a matrix because it is one of the operands - // of a matrix x matrix operation. - CHECK_EQ(num_spatial_dims, 2); - // Propagate tile sharding to the bigger operand, and replicate the other. - auto other_op = user.operand(op_idx ^ 1); - if (ShapeUtil::ByteSizeOf(instruction.shape()) > - ShapeUtil::ByteSizeOf(other_op->shape())) { - return user.sharding(); - } else { - return HloSharding::Replicate(); - } + auto dnums = dot_as_convolution_util::ParseDotGeneralFromDot(&user); + return InferDotOperandSharding(&user, dnums, op_idx, + may_combine_partial_sharding); } case HloOpcode::kReduce: { if (instruction.shape().rank() == 0) { @@ -1216,10 +1351,11 @@ absl::optional GetShardingFromUser( return user_sharding; } std::vector target_tile_assignment_dimensions( - instruction.shape().rank()); + instruction.shape().rank() + + (user_sharding.ReplicateOnLastTileDim() ? 
1 : 0)); const auto& dimensions = user.dimensions(); int64 next_output_dim = 0; - for (int64 i = 0; i < instruction.shape().rank(); ++i) { + for (int64 i = 0; i < target_tile_assignment_dimensions.size(); ++i) { if (absl::c_find(dimensions, i) == dimensions.end()) { target_tile_assignment_dimensions[i] = user_sharding.tile_assignment().dim(next_output_dim++); @@ -1229,7 +1365,9 @@ absl::optional GetShardingFromUser( } auto tile_assignment = user_sharding.tile_assignment(); tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(tile_assignment); + return user_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } case HloOpcode::kSort: { if (user.sharding().IsTuple()) { @@ -1299,17 +1437,21 @@ absl::optional GetShardingFromUser( // false otherwise. bool InferShardingFromUsers(HloInstruction* instruction, const ComputationMap& computation_map, - bool aggressive_prop, bool is_spmd) { + int64 aggressiveness, bool is_spmd) { + if (aggressiveness < 2 && instruction->opcode() == HloOpcode::kBroadcast) { + return false; + } if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { return false; } bool improved_sharding = false; + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; for (const HloInstruction* user : instruction->users()) { absl::optional user_sharding = - GetShardingFromUser(*instruction, *user, aggressive_prop, is_spmd); + GetShardingFromUser(*instruction, *user, aggressiveness, is_spmd); if (user_sharding) { - improved_sharding |= - MaybeImproveInstructionSharding(*user_sharding, instruction); + improved_sharding |= MaybeImproveInstructionSharding( + std::move(*user_sharding), instruction, may_combine_partial_sharding); } } return improved_sharding; @@ -1579,10 +1721,12 @@ StatusOr ShardingPropagation::Run(HloModule* module) { // strictly improve the sharding of the graph and it can't be improved // indefinitely. int64 iterations = 0; - auto run_to_fix_point = [&](bool aggressive_prop) { - bool changed = true; - while (changed) { - changed = false; + auto run_to_fix_point = [&](int64 aggressiveness) { + absl::flat_hash_set already_inferred_from_operands; + absl::flat_hash_set already_inferred_from_users; + bool changed_last_iter = true; + while (changed_last_iter) { + changed_last_iter = false; int64 inferred_from_operand_counter = 0; int64 inferred_from_user_counter = 0; int64 instruction_counter = 0; @@ -1595,42 +1739,55 @@ StatusOr ShardingPropagation::Run(HloModule* module) { for (const HloInstruction* instruction : instructions) { already_sharded_counter += (instruction->has_sharding() ? 1 : 0); } - - // Remove the instructions where the sharding was provided from the - // outside so we don't modify them. - instructions.erase( - std::remove_if(instructions.begin(), instructions.end(), - [&](HloInstruction* instruction) { - return provided_shardings.contains(instruction); - }), - instructions.end()); - // First iterate the HLO graph in post order taking shardings from // operands. 
for (HloInstruction* instruction : instructions) { + if (already_inferred_from_operands.contains(instruction) || + provided_shardings.contains(instruction)) { + continue; + } + already_inferred_from_operands.insert(instruction); if (InferShardingFromOperands(instruction, computation_map, is_spmd_, - aggressive_prop)) { + aggressiveness)) { ++inferred_from_operand_counter; - changed = true; + any_changed = true; VLOG(2) << "Add sharding (forward-pass): " << instruction->ToString(); maybe_computation_propagation(instruction); + for (auto operand : instruction->operands()) { + already_inferred_from_users.erase(operand); + } + for (auto user : instruction->users()) { + already_inferred_from_operands.erase(user); + } + changed_last_iter = true; } } // Then iterate the HLO graph in reverse post order taking shardings // from users. for (auto it = instructions.rbegin(); it != instructions.rend(); ++it) { - if (InferShardingFromUsers(*it, computation_map, aggressive_prop, + if (already_inferred_from_users.contains(*it) || + provided_shardings.contains(*it)) { + continue; + } + already_inferred_from_users.insert(*it); + if (InferShardingFromUsers(*it, computation_map, aggressiveness, is_spmd_)) { ++inferred_from_user_counter; - changed = true; + any_changed = true; VLOG(2) << "Add sharding (backward-pass): " << (*it)->ToString(); maybe_computation_propagation(*it); + for (auto operand : (*it)->operands()) { + already_inferred_from_users.erase(operand); + } + for (auto user : (*it)->users()) { + already_inferred_from_operands.erase(user); + } + changed_last_iter = true; } } } - any_changed |= changed; VLOG(1) << "Sharding propagation iteration " << iterations << ";"; VLOG(1) << " total instructions: " << instruction_counter; VLOG(1) << " instructions already sharded: " << already_sharded_counter; @@ -1638,11 +1795,13 @@ StatusOr ShardingPropagation::Run(HloModule* module) { << inferred_from_operand_counter; VLOG(1) << " shardings inferred from users: " << inferred_from_user_counter; + VLOG(1) << " aggressiveness: " << aggressiveness; ++iterations; } }; - run_to_fix_point(false); - run_to_fix_point(true); + for (int64 aggressiveness = 0; aggressiveness < 4; ++aggressiveness) { + run_to_fix_point(aggressiveness); + } VLOG(1) << "Sharding propagation completed after " << iterations << " iterations"; diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc index 594130daf0b..8c4d8fc24ff 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation_test.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation_test.cc @@ -65,22 +65,6 @@ ENTRY %elementwise { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } -TEST_F(ShardingPropagationTest, BroadcastForwardPassNoSharding) { - const char* const hlo_string = R"( -HloModule module -ENTRY %broadcast { - %param0 = f32[7,11]{1,0} parameter(0), - sharding={devices=[2,2]0,1,2,3} - %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={1,2} - ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast) -})"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(hlo_string)); - TF_ASSERT_OK_AND_ASSIGN(bool changed, - ShardingPropagation().Run(module.get())); - EXPECT_FALSE(changed); -} - // Regression Test for b/129569657. 
TEST_F(ShardingPropagationTest, BroadcastForwardPass) { const char* const hlo_string = R"( @@ -118,6 +102,25 @@ ENTRY %broadcast { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, BroadcastForwardPartial) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[3,2048]parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %broadcast = f32[3,2048,3] broadcast(%param0), dimensions={0,1} + ROOT %copy = f32[3,2048,3] copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, BroadcastUser) { const char* const hlo_string = R"( HloModule module @@ -136,6 +139,25 @@ ENTRY %broadcast { op::Sharding("{devices=[2,4]0,1,2,3,4,5,6,7}")); } +TEST_F(ShardingPropagationTest, BroadcastUserPartial) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[24,8]{0,1} parameter(0) + %copy = f32[24,8]{0,1} copy(%param0) + ROOT %broadcast = f32[4,24,6,8] broadcast(%copy), dimensions={1,3}, + sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,4]0,2,4,6,1,3,5,7 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, MaximalReduceForwardPass) { const char* const hlo_string = R"( HloModule module @@ -184,6 +206,78 @@ ENTRY %reduce { op::Sharding("{devices=[2,2]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, ReducePartiallyOnTiledDims) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0), sharding={devices=[2,2]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%param0, %init), dimensions={0}, to_apply=%add + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,2]0,2,1,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, ReducePartiallyOnTiledDims2) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0), sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%param0, %init), dimensions={0}, to_apply=%add + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + 
+TEST_F(ShardingPropagationTest, ReducePartiallyBackward) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[8,8] parameter(0) + %input = f32[8,8] copy(%param0) + %init = f32[] parameter(1) + %reduce = f32[8] reduce(%input, %init), dimensions={0}, to_apply=%add, + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %copy = f32[8] copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ShardedTupleReduceForwardAndBackwardPass) { const char* const hlo_string = R"( HloModule module @@ -420,6 +514,26 @@ ENTRY %pad { op::Sharding("{devices=[2,2]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, PartialReplicatedPadForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %pad { + %input = f32[11,17]{1,0} parameter(0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %pad_value = f32[] parameter(1) + %pad = f32[27,51]{1,0} pad(%input, %pad_value), padding=2_4_1x1_1_2 + ROOT %copy = f32[27,51]{1,0} copy(%pad) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "pad"), + op::Sharding("{devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ShardedPreferredOverReplicated) { const char* const hlo_string = R"( HloModule module @@ -446,6 +560,43 @@ ENTRY %replicated { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, PartialReplicateReshapeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[1430,1]{1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %reshape = f32[10,11,13]{2,1,0} reshape(%param0) + ROOT %copy = f32[10,11,13]{2,1,0} copy(%reshape) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "reshape"), + op::Sharding("{devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialReplicateReshapeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[2002,1]{1,0} parameter(0) + %copy = f32[2002,1]{1,0} copy(f32[2002,1]{1,0} %param0) + ROOT %reshape = f32[14,11,13]{2,1,0} reshape(%copy), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, DontShardTuplesIfAllInputIsMaximal) { const char* const hlo_string = R"( HloModule module @@ -506,6 +657,25 @@ ENTRY %slice { 
op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, PartialReplicatedStridedSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %slice { + %param = f32[17,13]{1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %slice = f32[7,5]{1,0} slice(%param), slice={[1:15:2], [5:10:1]} + ROOT %tuple = (f32[7,5]{1,0}) tuple(%slice) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "slice"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ReduceWindowBackwardPass) { const char* const hlo_string = R"( HloModule module @@ -565,13 +735,15 @@ ENTRY conv { %rhs = f32[2,2,1]{2,1,0} parameter(1) %conv = f32[3,2,3]{2,1,0} convolution(%lhs, %rhs), window={size=1}, dim_labels=bf0_oi0->bf0 - ROOT %tuple = f32[3,2,3]{2,1,0} tuple(%conv) + ROOT %tuple = (f32[3,2,3]{2,1,0}) tuple(%conv) })"; TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); TF_ASSERT_OK_AND_ASSIGN(bool changed, ShardingPropagation().Run(module.get())); - EXPECT_FALSE(changed); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{replicated}")); } TEST_F(ShardingPropagationTest, ConvolutionDifferentDimensionNumbers) { @@ -937,7 +1109,7 @@ ENTRY %conv { %p0_copy_0 = f32[8,256,128] copy(%param.0), sharding={devices=[1,4,1]0,1,2,3} %p1_copy_0 = f32[8,128,512] copy(%param.1), - sharding={devices=[1,2,2]0,1,2,3} + sharding={devices=[1,1,4]0,1,2,3} %p2_copy = f32[8,128] copy(%param.2) %dot_prop_rhs = f32[8,256,512] dot(%p0_copy_0, %p1_copy_0), lhs_batch_dims={0}, rhs_batch_dims={0}, @@ -966,16 +1138,18 @@ ENTRY %conv { ShardingPropagation().Run(module.get())); EXPECT_TRUE(changed); EXPECT_THAT(FindInstruction(module.get(), "dot_prop_rhs"), - op::Sharding("{devices=[1,2,2]0,1,2,3}")); + op::Sharding("{devices=[1,1,4]0,1,2,3}")); EXPECT_THAT(FindInstruction(module.get(), "dot_prop_lhs"), - op::Sharding("{devices=[1,2,2]0,1,2,3}")); + op::Sharding("{devices=[1,4,1]0,1,2,3}")); EXPECT_THAT(FindInstruction(module.get(), "dot_mat_vec"), op::Sharding("{devices=[1,4]0,1,2,3}")); - EXPECT_THAT(FindInstruction(module.get(), "p0_copy_1"), - op::Sharding("{replicated}")); - EXPECT_THAT(FindInstruction(module.get(), "p1_copy_1"), - op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT( + FindInstruction(module.get(), "p0_copy_1"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate}")); + EXPECT_THAT( + FindInstruction(module.get(), "p1_copy_1"), + op::Sharding("{devices=[1,1,2,2]0,2,1,3 last_tile_dim_replicate}")); EXPECT_THAT(FindInstruction(module.get(), "dot_back_prop_rhs"), op::Sharding("{devices=[1,2,2]0,1,2,3}")); } @@ -1004,6 +1178,146 @@ ENTRY %conv { op::Sharding("{devices=[2,2,1]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, DotMergeOperands) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,256,512] parameter(0), + sharding={devices=[2,2,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %p1 = f32[8,128,512] parameter(1), + sharding={devices=[2,2,1,2]0,2,1,3,4,6,5,7 last_tile_dim_replicate} + %dot = f32[8,256,128] dot(%p0, %p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2} + ROOT %copy = f32[8,256,128] copy(%dot) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dot"), + op::Sharding("{devices=[2,2,2]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, DotMergeOperands2) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,256,512] parameter(0), sharding={devices=[2,2,2]0,1,2,3,4,5,6,7} + %p1 = f32[8,128,512] parameter(1), sharding={devices=[2,2,2]0,1,2,3,4,5,6,7} + %dot = f32[8,256,128] dot(%p0, %p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2} + ROOT %copy = f32[8,256,128] copy(%dot) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "dot"), + op::Sharding( + "{devices=[2,2,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, BackwardDotFromContracting) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,256,512] parameter(0), sharding={devices=[2,2,2]0,1,2,3,4,5,6,7} + %p1 = f32[8,128,512] parameter(1) + %copy1 = f32[8,128,512] copy(%p1) + %dot = f32[8,256,128] dot(%p0, %copy1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy = f32[8,256,128] copy(%dot) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy1"), + op::Sharding("{devices=[2,2,2]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, ConvAsDotOnTrivialDims) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %lhs = f32[128,1,1,1001] parameter(0), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[1,1,1024,1001] parameter(1), sharding={devices=[1,2,1,1]0,1} + %convolution = f32[128,1,1,1024] convolution(%lhs, %rhs), + window={size=1x1 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f + ROOT %copy = f32[128,1,1,1024] copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "convolution"), + op::Sharding("{devices=[1,1,2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ConvAsDotOnTrivialDimsBackward) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[128,5,5,128] parameter(0) + %lhs = f32[128,5,5,128] copy(%p0) + %p1 = f32[5,5,128,768] parameter(1) + %rhs = f32[5,5,128,768] copy(%p1) + %convolution = f32[128,1,1,768] convolution(%lhs, %rhs), window={size=5x5}, + dim_labels=b01f_01io->b01f, sharding={devices=[1,2,1,1]0,1} + ROOT %copy = f32[128,1,1,768] copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), 
"rhs"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, + ConvolutionFilterIFOFPartitionedInputPartialReplicate) { + const char* const hlo_string = R"( + HloModule module + +ENTRY entry { + %lhs = f32[128,112,112,12] parameter(0) + %lhs.copy = f32[128,112,112,12] copy(f32[128,112,112,12] %lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[7,7,12,64] parameter(1) + %rhs.copy = f32[7,7,12,64] copy(f32[7,7,12,64] %rhs), + sharding={devices=[1,1,2,2]0,1,2,3} + %conv = f32[128,56,56,64] convolution( + f32[128,112,112,12] %lhs.copy, + f32[7,7,12,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f + ROOT %copy = f32[128,56,56,64] copy(conv) +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + VLOG(1) << module->ToString(); + + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,1,2,2]0,2,1,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ConcatFromUserUnshardedDim) { const char* const hlo_string = R"( HloModule module @@ -1155,15 +1469,15 @@ ENTRY entry { EXPECT_THAT(FindInstruction(module.get(), "ttr"), op::Sharding("{devices=[2,1]0,1}")); EXPECT_THAT(FindInstruction(module.get(), "tr"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "fp"), op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "fgte"), op::Sharding("{devices=[1,3]0,1,2}")); EXPECT_THAT(FindInstruction(module.get(), "fr"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "conditional"), - op::Sharding("{{devices=[2,1]0,1}}")); + op::Sharding("{{devices=[1,3]0,1,2}}")); } TEST_F(ShardingPropagationTest, TupleFromUser) { @@ -1515,6 +1829,28 @@ ENTRY entry { op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, GatherFromIndex_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherFromDataOperand) { const char* hlo_string = R"( HloModule module @@ -1536,6 +1872,28 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherFromDataOperand_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + 
TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherToIndex) { const char* hlo_string = R"( HloModule module @@ -1557,6 +1915,98 @@ ENTRY entry { op::Sharding("{devices=[2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToIndex_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1) + %indices = s32[3] copy(%p1) + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[2,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, GatherToIndex2) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,1000,2] parameter(1) + %indices = s32[2,1000,2] copy(%p1) + ROOT %gather = bf16[2,1000,4] + gather(bf16[2,4819,4] %input, s32[2,1000,2] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, slice_sizes={1,1,4}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, GatherToIndex2_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,1000,2] parameter(1) + %indices = s32[2,1000,2] copy(%p1) + ROOT %gather = bf16[2,1000,4] + gather(bf16[2,4819,4] %input, s32[2,1000,2] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, slice_sizes={1,1,4}, + sharding={devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, GatherToIndex3) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,2,1000] parameter(1) + %indices = s32[2,2,1000] copy(%p1) + ROOT %gather = bf16[2,1000,4] + gather(bf16[2,4819,4] %input, s32[2,2,1000] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, slice_sizes={1,1,4}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool 
changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + TEST_F(ShardingPropagationTest, GatherToDataOperand) { const char* hlo_string = R"( HloModule module @@ -1578,6 +2028,27 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToDataOperand_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, DataOperandToScatter) { const char* const hlo_string = R"( HloModule module @@ -1609,6 +2080,38 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, DataOperandToScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, UpdateOperandToScatter) { const char* const hlo_string = R"( HloModule module @@ -1640,6 +2143,70 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, UpdateOperandToScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + 
EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, ScatterToDataOperand_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterToDataOperand) { const char* const hlo_string = R"( HloModule module @@ -1671,6 +2238,38 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterToUpdateOperand_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0) + %indices = s32[3] parameter(1), sharding={replicated} + %p2 = f32[3,9] parameter(2) + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterToUpdateOperand) { const char* const hlo_string = R"( HloModule module @@ -1733,6 +2332,38 @@ ENTRY entry { op::Sharding("{devices=[2]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterUpdateToIndex_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1), sharding={replicated} + %indices = s32[3] copy(%p1) + %updates = f32[3,9] parameter(2), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + 
op::Sharding("{devices=[2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterIndexToUpdate) { const char* const hlo_string = R"( HloModule module @@ -1764,5 +2395,130 @@ ENTRY entry { op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterIndexToUpdate_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + %p2 = f32[3,9] parameter(2), sharding={replicated} + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingOnElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0), sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %p1 = f32[2,9] parameter(1), sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + %lhs = f32[2,9] copy(%p0) + %rhs = f32[2,9] copy(%p1) + %add = f32[2,9] add(%lhs, %rhs) + ROOT %copy = f32[2,9] copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs"), + op::Sharding("{devices=[2,2]0,2,1,3}")); + EXPECT_THAT(FindInstruction(module.get(), "rhs"), + op::Sharding("{devices=[2,2]0,2,1,3}")); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingOnElementwise2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0), sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %p1 = f32[2,9] parameter(1), sharding={devices=[2,1,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} + %lhs = f32[2,9] copy(%p0) + %rhs = f32[2,9] copy(%p1) + %add = f32[2,9] add(%lhs, %rhs) + ROOT %copy = f32[2,9] copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "lhs"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); + EXPECT_THAT( + FindInstruction(module.get(), "rhs"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); + EXPECT_THAT( + FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingTransposeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0), + 
sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %transpose = f32[11,13,7]{2,1,0} transpose(%param), dimensions={1,2,0} + ROOT %copy = f32[11,13,7]{2,1,0} copy(%transpose) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "transpose"), + op::Sharding( + "{devices=[1,2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingTransposeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0) + %copy = f32[7,11,13]{2,1,0} copy(%param) + ROOT %transpose = f32[11,13,7]{2,1,0} transpose(%copy), dimensions={1,2,0}, + sharding={devices=[1,2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "copy"), + op::Sharding( + "{devices=[2,1,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index ce19934bb88..5fd7b7850cf 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -23,6 +23,7 @@ cc_library( "spmd_partitioner_util.cc", ], hdrs = [ + "convolution_handler.h", "spmd_partitioner.h", "spmd_partitioner_util.h", ], @@ -48,6 +49,7 @@ cc_library( "//tensorflow/compiler/xla/service:pattern_matcher", "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/core:lib", "//tensorflow/core/platform:numbers", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", @@ -73,3 +75,16 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +cc_library( + name = "schedule_aware_all_gather_cse", + srcs = ["schedule_aware_all_gather_cse.cc"], + hdrs = ["schedule_aware_all_gather_cse.h"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:flat_hash_map", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/convolution_handler.cc b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc index 01d7ea2ff14..81419c55109 100644 --- a/tensorflow/compiler/xla/service/spmd/convolution_handler.cc +++ b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/service/spmd/convolution_handler.h" + #include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" @@ -32,24 +34,32 @@ limitations under the License. namespace xla { namespace spmd { + namespace { -// Partition convolution. 
-StatusOr<HloInstruction*> PartitionConvolution(
-    PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
-    const HloSharding& output_sharding, const Window& conv_window,
-    HloInstruction* original_hlo, int64 num_partitions,
-    const SpmdPartitionerOptions& options, HloInstruction* partition_id,
-    HloModule* module, SpmdBuilder* b);
-
-// Partition convolution with only paralell dims are tiled
-StatusOr<HloInstruction*> PartitionConvolutionWithParallelDimension(
+// Partition convolution with batch group count.
+StatusOr<HloInstruction*> PartitionConvolutionWithBatchGroupCount(
     PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
     const HloSharding& output_sharding, const Window& conv_window,
     HloInstruction* original_hlo, int64 num_partitions, SpmdBuilder* b) {
   TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution);
+  if (original_hlo->batch_group_count() == 1 ||
+      original_hlo->batch_group_count() < num_partitions) {
+    return nullptr;
+  }
   const auto& dnums = original_hlo->convolution_dimension_numbers();
+  // Only supports the case where batch_group_size equals input_batch_size.
+  const int64 input_batch_size =
+      lhs.base_shape().dimensions(dnums.input_batch_dimension());
+  const int64 kernel_output_feature_size =
+      rhs.base_shape().dimensions(dnums.kernel_output_feature_dimension());
+  if (input_batch_size != kernel_output_feature_size ||
+      original_hlo->batch_group_count() != input_batch_size) {
+    return nullptr;
+  }
+
+  // Map RHS indices to LHS indices.
   std::vector<int64> rhs_to_lhs_indices(output_base_shape.rank());
   rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] =
       dnums.input_batch_dimension();
@@ -59,73 +69,167 @@ StatusOr<HloInstruction*> PartitionConvolutionWithParallelDimension(
     rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] =
         dnums.input_spatial_dimensions(i);
   }
+
+  // Map LHS indices to RHS indices.
   std::vector<int64> lhs_to_rhs_indices(output_base_shape.rank());
   for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) {
     lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i;
   }
+
+  // Map LHS indices to output indices.
+  std::vector<int64> lhs_to_output_indices(lhs.base_shape().rank(), -1);
+  lhs_to_output_indices[dnums.input_batch_dimension()] =
+      dnums.output_feature_dimension();
+  lhs_to_output_indices[dnums.input_feature_dimension()] =
+      dnums.output_batch_dimension();
+  for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
+    lhs_to_output_indices[dnums.input_spatial_dimensions(i)] =
+        dnums.output_spatial_dimensions(i);
+  }
+
+  // Align LHS or RHS to other operand if input batch dim or kernel output
+  // feature dim is partitioned.
   auto aligned_rhs_sharding =
       hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices);
   auto aligned_lhs_sharding =
       hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices);
-  // Handling cases where all the partitioned dimensions are parallel
-  // dimensions.
-  int64 lhs_parallel_dim_partitions = 1;
-  int64 rhs_parallel_dim_partitions = 1;
-  std::vector<int64> parallel_spatial_dims;
-  for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
-    int64 lhs_dim = dnums.input_spatial_dimensions(i);
-    int64 lhs_size = lhs.base_shape().dimensions(lhs_dim);
-    const auto& wd = conv_window.dimensions(i);
-    int64 rhs_dim = dnums.kernel_spatial_dimensions(i);
-    if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) {
-      parallel_spatial_dims.emplace_back(i);
-      lhs_parallel_dim_partitions *= ShardCountAtDim(lhs.sharding(), lhs_dim);
-      rhs_parallel_dim_partitions *= ShardCountAtDim(rhs.sharding(), rhs_dim);
-    }
-  }
-  bool lhs_partition_dims_are_parallel =
-      (lhs_parallel_dim_partitions == num_partitions);
-  bool rhs_partition_dims_are_parallel =
-      (rhs_parallel_dim_partitions == num_partitions);
-
-  // If there is a parallel dim and all the partitioned dimensions are parallel
-  // dimensions in either LHS or RHS, simply create partitioned convolutions.
-  if (parallel_spatial_dims.empty() || ((!lhs_partition_dims_are_parallel) &&
-                                        (!rhs_partition_dims_are_parallel))) {
+  bool lhs_batch_dim_is_partitioned =
+      (ShardCountAtDim(lhs.sharding(), dnums.input_batch_dimension()) ==
+       num_partitions);
+  bool rhs_output_feature_dim_is_partitioned =
+      (ShardCountAtDim(rhs.sharding(),
+                       dnums.kernel_output_feature_dimension()) ==
+       num_partitions);
+  if (!lhs_batch_dim_is_partitioned && !rhs_output_feature_dim_is_partitioned) {
     return nullptr;
   }
-  // Reshard LHS or RHS to partition at parallel dimensions as the other
-  // operand.
-  if (lhs_partition_dims_are_parallel) {
+  // Reshard LHS or RHS to partition at batch dimension or output feature
+  // dimension as the other operand.
+  if (lhs_batch_dim_is_partitioned) {
+    rhs = rhs.Reshard(aligned_rhs_sharding);
+  } else {
+    lhs = lhs.Reshard(aligned_lhs_sharding);
+  }
+  // Align output sharding after LHS and RHS sharding are consistent.
+  auto aligned_output_sharding = hlo_sharding_util::TransposeSharding(
+      lhs.sharding(), lhs_to_output_indices);
+
+  // Get LHS and RHS sharded shape.
+  auto lhs_shard_shape = MakePartitionedShape(lhs.base_shape(), lhs.sharding());
+  auto rhs_shard_shape = MakePartitionedShape(rhs.base_shape(), rhs.sharding());
+  const int64 batch_group_count =
+      CeilOfRatio(original_hlo->batch_group_count(), num_partitions);
+  // Create partitioned convolution.
+  TF_ASSIGN_OR_RETURN(
+      Shape sharded_conv_shape,
+      ShapeInference::InferConvolveShape(
+          lhs_shard_shape, rhs_shard_shape, original_hlo->feature_group_count(),
+          batch_group_count, conv_window, dnums));
+  auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve(
+      sharded_conv_shape, lhs.hlo(), rhs.hlo(),
+      original_hlo->feature_group_count(), batch_group_count, conv_window,
+      dnums, original_hlo->precision_config()));
+  sharded_conv->set_sharding(aligned_output_sharding);
+  return PartitionedHlo(sharded_conv, output_base_shape, lhs.state())
+      .Reshard(output_sharding)
+      .hlo();
+}
+
+// Partition convolution with feature group count.
+StatusOr PartitionConvolutionWithFeatureGroupCount( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const Window& conv_window, + HloInstruction* original_hlo, int64 num_partitions, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + if (original_hlo->feature_group_count() == 1 || + original_hlo->feature_group_count() < num_partitions) { + return nullptr; + } + + const auto& dnums = original_hlo->convolution_dimension_numbers(); + const int64 input_feature_size = + lhs.base_shape().dimensions(dnums.input_feature_dimension()); + const int64 kernel_output_feature_size = + rhs.base_shape().dimensions(dnums.kernel_output_feature_dimension()); + if (input_feature_size != kernel_output_feature_size || + input_feature_size % original_hlo->feature_group_count() != 0) { + return nullptr; + } + + // Align RHS indices to LHS. + std::vector rhs_to_lhs_indices(output_base_shape.rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_feature_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_batch_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + + // Align LHS indices to RHS. + std::vector lhs_to_rhs_indices(output_base_shape.rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + + // Align LHS indices to output. + std::vector lhs_to_output_indices(output_base_shape.rank()); + lhs_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + lhs_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + + // Align LHS or RHS if input_feature_dim or kernel_output_feature_dim is + // partitioned. + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + bool lhs_feature_dim_is_partitioned = + (ShardCountAtDim(lhs.sharding(), dnums.input_feature_dimension()) == + num_partitions); + bool rhs_output_feature_dim_is_partitioned = + (ShardCountAtDim(rhs.sharding(), + dnums.kernel_output_feature_dimension()) == + num_partitions); + if (!lhs_feature_dim_is_partitioned && + !rhs_output_feature_dim_is_partitioned) { + return nullptr; + } + // Reshard LHS or RHS to partition at input feature dimension or output + // feature dimension as the other operand. + if (lhs_feature_dim_is_partitioned) { rhs = rhs.Reshard(aligned_rhs_sharding); } else { lhs = lhs.Reshard(aligned_lhs_sharding); } - // Get LHS and RHS sharded shape. + // Align output sharding after LHS and RHS sharding are consistent. + auto aligned_output_sharding = hlo_sharding_util::TransposeSharding( + lhs.sharding(), lhs_to_output_indices); + auto lhs_shard_shape = MakePartitionedShape(lhs.base_shape(), lhs.sharding()); auto rhs_shard_shape = MakePartitionedShape(rhs.base_shape(), rhs.sharding()); + int64 feature_group_count = + CeilOfRatio(original_hlo->feature_group_count(), num_partitions); - // Update convolution window. 
- auto new_window = conv_window; - for (const auto& spatial_dim : parallel_spatial_dims) { - auto wd = new_window.mutable_dimensions(spatial_dim); - wd->set_size(lhs_shard_shape.dimensions( - dnums.input_spatial_dimensions(spatial_dim))); - wd->set_stride(std::max(1, wd->size() - 1)); - wd->set_base_dilation(wd->size()); - } TF_ASSIGN_OR_RETURN( Shape sharded_conv_shape, ShapeInference::InferConvolveShape( - lhs_shard_shape, rhs_shard_shape, original_hlo->feature_group_count(), - original_hlo->batch_group_count(), new_window, dnums)); + lhs_shard_shape, rhs_shard_shape, feature_group_count, + original_hlo->batch_group_count(), conv_window, dnums)); auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve( - sharded_conv_shape, lhs.hlo(), rhs.hlo(), - original_hlo->feature_group_count(), original_hlo->batch_group_count(), - new_window, dnums, original_hlo->precision_config())); - sharded_conv->set_sharding(original_hlo->sharding()); + sharded_conv_shape, lhs.hlo(), rhs.hlo(), feature_group_count, + original_hlo->batch_group_count(), conv_window, dnums, + original_hlo->precision_config())); + sharded_conv->set_sharding(aligned_output_sharding); return PartitionedHlo(sharded_conv, output_base_shape, lhs.state()) .Reshard(output_sharding) .hlo(); @@ -214,7 +318,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( int64 lhs_dimension = dnums.input_spatial_dimensions(i); int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); - auto wd = conv_window.dimensions(i); + const auto& wd = conv_window.dimensions(i); if (wd.base_dilation() != 1 || wd.window_reversal()) { return nullptr; } @@ -260,7 +364,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( // Calculate the left and right halo sizes as described in the comments // above. It calculcates the halo sizes with dilation, so we apply // CeilOfRatio({left,right}_halo_size, window_dilation). - auto wd = conv_window.dimensions(i); + const auto& wd = conv_window.dimensions(i); int64 padding_low = wd.padding_low(); int64 padding_high = wd.padding_high(); int64 base = lhs.base_shape().dimensions(lhs_dimension); @@ -430,7 +534,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; } - Window window = conv_window; + const Window& window = conv_window; std::vector reversed_rhs_dims; for (int64 i = 0; i < window.dimensions_size(); ++i) { if (window.dimensions(i).window_reversal()) { @@ -505,7 +609,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( int64 lhs_dimension = dnums.input_spatial_dimensions(i); int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); - auto wd = window.dimensions(i); + const auto& wd = window.dimensions(i); if (wd.base_dilation() != 1) { // TODO(wangtao): support parallel dim if it is replicate here. return nullptr; @@ -540,7 +644,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( // Calculate the left and right halo sizes as described in the comments // above. 
- auto wd = window.dimensions(i); + const auto& wd = window.dimensions(i); int64 padding_low = wd.padding_low(); int64 padding_high = wd.padding_high(); int64 base = lhs.base_shape().dimensions(lhs_dimension); @@ -692,116 +796,6 @@ StatusOr PartitionConvolutionTiledOutput( shard_shape.dimensions())); } -StatusOr PartitionConvolutionGroupOnParallelDim( - PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, const ConvolutionDimsMapping& dims_mapping, - int64 num_partitions, const SpmdPartitionerOptions& options, - HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { - std::vector lhs_dims; - std::vector rhs_dims; - std::vector output_dims; - auto lhs_sharding_dims_adjusted_to_output = - lhs.sharding().IsReplicated() - ? std::vector(lhs.base_shape().rank(), 1) - : lhs.sharding().tile_assignment().dimensions(); - auto rhs_sharding_dims_adjusted_to_output = - rhs.sharding().IsReplicated() - ? std::vector(rhs.base_shape().rank(), 1) - : rhs.sharding().tile_assignment().dimensions(); - auto output_sharding_dims_adjusted_to_lhs = - output_sharding.tile_assignment().dimensions(); - bool lhs_rhs_dims_matching = true; - for (const auto& dim : dims_mapping.parallel_spatial_dims) { - lhs_dims.push_back(dim.lhs); - rhs_dims.push_back(dim.rhs); - output_dims.push_back(dim.output); - if (lhs_sharding_dims_adjusted_to_output[dim.lhs] != - rhs_sharding_dims_adjusted_to_output[dim.rhs]) { - lhs_rhs_dims_matching = false; - } - lhs_sharding_dims_adjusted_to_output[dim.lhs] = - output_sharding.tile_assignment().dim(dim.output); - rhs_sharding_dims_adjusted_to_output[dim.rhs] = - output_sharding.tile_assignment().dim(dim.output); - output_sharding_dims_adjusted_to_lhs[dim.output] = - lhs.sharding().tile_assignment().dim(dim.lhs); - } - auto lhs_grouped = GroupShardingOnDims(lhs.sharding(), lhs_dims); - auto rhs_grouped = GroupShardingOnDims(rhs.sharding(), rhs_dims); - auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); - if (lhs_rhs_dims_matching) { - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) > - ShapeUtil::ByteSizeOf(rhs.base_shape())) { - rhs_grouped = AlignGroupsWith(std::move(rhs_grouped), lhs_grouped); - rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); - } else { - lhs_grouped = AlignGroupsWith(std::move(lhs_grouped), rhs_grouped); - lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); - } - auto reshaped_output_tiling = output_sharding.tile_assignment(); - reshaped_output_tiling.Reshape(output_sharding_dims_adjusted_to_lhs); - output_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_output_tiling), - output_dims), - lhs_grouped); - } else { - auto reshaped_lhs_tiling = lhs.sharding().tile_assignment(); - reshaped_lhs_tiling.Reshape(lhs_sharding_dims_adjusted_to_output); - lhs_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_lhs_tiling), lhs_dims), - output_grouped); - lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); - auto reshaped_rhs_tiling = rhs.sharding().tile_assignment(); - reshaped_rhs_tiling.Reshape(rhs_sharding_dims_adjusted_to_output); - rhs_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_rhs_tiling), rhs_dims), - output_grouped); - rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); - } - - // Update LHS and RHS sharding and shape. 
- lhs.hlo()->set_sharding(lhs_grouped.sharding); - rhs.hlo()->set_sharding(rhs_grouped.sharding); - CHECK(lhs.hlo() != rhs.hlo() || lhs_grouped.sharding == rhs_grouped.sharding); - auto per_group_partitioner_state = CreatePerGroupPartitioningState( - lhs.state(), lhs_grouped.device_groups, b); - auto grouped_lhs_base_shape = - GetPerGroupBaseShape(lhs_grouped, lhs.base_shape()); - auto grouped_lhs_shard_shape = - MakePartitionedShape(grouped_lhs_base_shape, lhs.sharding()); - // Update convolution window with the new shape - auto new_window = conv_window; - for (const auto& dim : dims_mapping.parallel_spatial_dims) { - auto wd = new_window.mutable_dimensions(dim.spatial); - wd->set_size(grouped_lhs_shard_shape.dimensions(dim.lhs)); - wd->set_stride(std::max(1, wd->size() - 1)); - wd->set_base_dilation(wd->size()); - } - - auto new_partition_id = - lhs.state().collective_ops_creator.create_partition_id(b); - TF_ASSIGN_OR_RETURN( - auto conv, - PartitionConvolution( - PartitionedHlo(lhs.hlo(), grouped_lhs_base_shape, - per_group_partitioner_state), - PartitionedHlo(rhs.hlo(), - GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), - per_group_partitioner_state), - GetPerGroupBaseShape(output_grouped, output_base_shape), - output_grouped.sharding, new_window, original_hlo, - num_partitions / output_grouped.device_groups.size(), options, - new_partition_id, module, b)); - // Reset the LHS sharding to the ungrouped one. - lhs.hlo()->set_sharding(UngroupSharding(lhs_grouped)); - rhs.hlo()->set_sharding(UngroupSharding(rhs_grouped)); - conv->set_sharding(UngroupSharding(output_grouped)); - return PartitionedHlo(conv, output_base_shape, lhs.state()) - .Reshard(output_sharding) - .hlo(); -} - // Partition convolution with only one kind of dims partitioned. StatusOr PartitionConvolutionBaseCase( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, @@ -811,13 +805,26 @@ StatusOr PartitionConvolutionBaseCase( HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); - // Case 1: Either RHS or LHS is only partitioned at parallel dimensions. - TF_ASSIGN_OR_RETURN(auto parallel_partitioned_conv, - PartitionConvolutionWithParallelDimension( - lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, num_partitions, b)); - if (parallel_partitioned_conv) { - return parallel_partitioned_conv; + // Case 1: Handle depthwise convolution with batch group count or + // feature group count. + if (original_hlo->batch_group_count() > 1) { + TF_ASSIGN_OR_RETURN(auto parallel_partitioned_conv, + PartitionConvolutionWithBatchGroupCount( + lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, num_partitions, b)); + if (parallel_partitioned_conv) { + return parallel_partitioned_conv; + } + } + + if (original_hlo->feature_group_count() > 1) { + TF_ASSIGN_OR_RETURN(auto parallel_partitioned_conv, + PartitionConvolutionWithFeatureGroupCount( + lhs, rhs, output_base_shape, output_sharding, + conv_window, original_hlo, num_partitions, b)); + if (parallel_partitioned_conv) { + return parallel_partitioned_conv; + } } // Case 2: both RHS and LHS are tiled. @@ -862,13 +869,15 @@ StatusOr PartitionConvolutionBaseCase( return nullptr; } +} // namespace + // Partition convolution. 
StatusOr PartitionConvolution( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, int64 num_partitions, - const SpmdPartitionerOptions& options, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b) { + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, const SpmdPartitionerOptions& options, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); TF_ASSIGN_OR_RETURN( @@ -880,133 +889,57 @@ StatusOr PartitionConvolution( return try_partitioned_conv; } - const auto& dnums = original_hlo->convolution_dimension_numbers(); - spmd::ConvolutionDimsMapping mapping; - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dim = dnums.input_spatial_dimensions(i); - int64 lhs_size = lhs.base_shape().dimensions(lhs_dim); - const auto& wd = original_hlo->window().dimensions(i); - int64 rhs_dim = dnums.kernel_spatial_dimensions(i); - int64 output_dim = dnums.output_spatial_dimensions(i); - if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) { - mapping.parallel_spatial_dims.emplace_back(); - mapping.parallel_spatial_dims.back().lhs = lhs_dim; - mapping.parallel_spatial_dims.back().rhs = rhs_dim; - mapping.parallel_spatial_dims.back().output = output_dim; - mapping.parallel_spatial_dims.back().spatial = i; - } else { - mapping.non_parallel_spatial_dims.emplace_back(); - mapping.non_parallel_spatial_dims.back().lhs = lhs_dim; - mapping.non_parallel_spatial_dims.back().rhs = rhs_dim; - mapping.non_parallel_spatial_dims.back().output = output_dim; - mapping.non_parallel_spatial_dims.back().spatial = i; - } - } - - // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. - auto get_partitions_for_dims = - [&](const HloSharding& sharding, - absl::Span dims, - int lhs_rhs_or_output) { - int64 partitions = 1; - if (sharding.IsTileMaximal()) { - return partitions; - } - for (const auto& dim : dims) { - if (lhs_rhs_or_output == 0) { - partitions *= sharding.tile_assignment().dim(dim.lhs); - } else if (lhs_rhs_or_output == 1) { - partitions *= sharding.tile_assignment().dim(dim.rhs); - } else { - CHECK_EQ(lhs_rhs_or_output, 2); - partitions *= sharding.tile_assignment().dim(dim.output); - } - } - return partitions; - }; - - const int64 lhs_parallel_spatial_partitions = - get_partitions_for_dims(lhs.sharding(), mapping.parallel_spatial_dims, 0); - const int64 rhs_parallel_spatial_partitions = - get_partitions_for_dims(rhs.sharding(), mapping.parallel_spatial_dims, 1); - const int64 output_parallel_spatial_partitions = get_partitions_for_dims( - original_hlo->sharding(), mapping.parallel_spatial_dims, 2); - - // Recursively partition on different types of dimensions. - // - // Case 1: Group partitions by parallel spatial dims. 
- if (lhs_parallel_spatial_partitions == rhs_parallel_spatial_partitions && - lhs_parallel_spatial_partitions == output_parallel_spatial_partitions && - lhs_parallel_spatial_partitions > 1) { - TF_ASSIGN_OR_RETURN(auto try_partitioned_conv, - PartitionConvolutionGroupOnParallelDim( - lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, mapping, num_partitions, - options, partition_id, module, b)); - if (try_partitioned_conv) { - return try_partitioned_conv; - } - } - return nullptr; } -} // namespace - Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { - auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); - if (dot_dnums) { - // Use HandleDotHelper() for convs that are actually einsums. - spmd::DotGeneralDimsMapping mapping; - for (const auto& dims : dot_dnums->batch_dims) { - mapping.batch_dims.emplace_back(); - mapping.batch_dims.back().lhs = dims.lhs; - mapping.batch_dims.back().rhs = dims.rhs; - mapping.batch_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->contracting_dims) { - mapping.contracting_dims.emplace_back(); - mapping.contracting_dims.back().lhs = dims.lhs; - mapping.contracting_dims.back().rhs = dims.rhs; - mapping.contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { - mapping.lhs_non_contracting_dims.emplace_back(); - mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.lhs_non_contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { - mapping.rhs_non_contracting_dims.emplace_back(); - mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.rhs_non_contracting_dims.back().output = dims.output; - } - auto create_sharded_conv = - [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, - spmd::SpmdBuilder* b) -> StatusOr { - TF_ASSIGN_OR_RETURN( - auto sharded_conv, - dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( - *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); - return b->AddInstruction(std::move(sharded_conv)); - }; - return HandleDotHelper(hlo, mapping, create_sharded_conv); + auto dims_info = dot_as_convolution_util::ParseConvolutionDimsInfo(hlo); + spmd::DotConvDimsMapping mapping; + for (const auto& dims : dims_info.batch_dims) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dims.lhs; + mapping.batch_dims.back().rhs = dims.rhs; + mapping.batch_dims.back().output = dims.output; + mapping.batch_dims.back().spatial = dims.spatial_dim; } - - auto lhs = GetPartitionedHlo(hlo->operand(0)); - auto rhs = GetPartitionedHlo(hlo->operand(1)); - TF_ASSIGN_OR_RETURN( - auto partitioned_conv, - PartitionConvolution(lhs, rhs, hlo->shape(), hlo->sharding(), - hlo->window(), hlo, num_partitions_, options_, - partition_id_, module_, &b_)); - - if (partitioned_conv) { - SetPartitionedHlo(hlo, [&] { return partitioned_conv; }); - return Status::OK(); + for (const auto& dims : dims_info.contracting_dims) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dims.lhs; + mapping.contracting_dims.back().rhs = dims.rhs; + mapping.contracting_dims.back().output = dims.output; + mapping.contracting_dims.back().spatial = dims.spatial_dim; } - return DefaultAction(hlo); + for (const auto& dims : dims_info.lhs_non_contracting_dims) { + mapping.lhs_non_contracting_dims.emplace_back(); 
+ mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.lhs_non_contracting_dims.back().output = dims.output; + mapping.lhs_non_contracting_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.rhs_non_contracting_dims) { + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.rhs_non_contracting_dims.back().output = dims.output; + mapping.rhs_non_contracting_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.conv_spatial_dims) { + mapping.conv_spatial_dims.emplace_back(); + mapping.conv_spatial_dims.back().lhs = dims.lhs; + mapping.conv_spatial_dims.back().rhs = dims.rhs; + mapping.conv_spatial_dims.back().output = dims.output; + mapping.conv_spatial_dims.back().spatial = dims.spatial_dim; + } + auto create_sharded_conv = + [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, + spmd::SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_conv, + dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( + *hlo, dims_info, lhs_hlo, rhs_hlo)); + return b->AddInstruction(std::move(sharded_conv)); + }; + return HandleDotHelper(hlo, mapping, create_sharded_conv); } } // namespace spmd diff --git a/tensorflow/compiler/xla/service/spmd/convolution_handler.h b/tensorflow/compiler/xla/service/spmd/convolution_handler.h new file mode 100644 index 00000000000..dced14a4872 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/convolution_handler.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_ + +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +namespace xla { +namespace spmd { + +// Partition convolution. 
+StatusOr<HloInstruction*> PartitionConvolution(
+    PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
+    const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping,
+    const Window& conv_window, HloInstruction* original_hlo,
+    int64 num_partitions, const SpmdPartitionerOptions& options,
+    HloInstruction* partition_id, HloModule* module, SpmdBuilder* b);
+
+}  // namespace spmd
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_
diff --git a/tensorflow/compiler/xla/service/spmd/dot_handler.cc b/tensorflow/compiler/xla/service/spmd/dot_handler.cc
index 55ebe120d01..25c21ba60f2 100644
--- a/tensorflow/compiler/xla/service/spmd/dot_handler.cc
+++ b/tensorflow/compiler/xla/service/spmd/dot_handler.cc
@@ -24,18 +24,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding_util.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/service/spmd/convolution_handler.h"
 #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
 #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/numbers.h"
 
 namespace xla {
 namespace spmd {
 
 Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) {
-  DotGeneralDimsMapping mapping;
+  DotConvDimsMapping mapping;
   const auto& dnums = hlo->dot_dimension_numbers();
   int64 next_output_dim = 0;
   for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) {
@@ -87,8 +89,8 @@ namespace {
 
 StatusOr<HloInstruction*> PartitionBaseCase(
     PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape,
-    const HloSharding& output_sharding,
-    const DotGeneralDimsMapping& dims_mapping, int64 num_partitions,
+    const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping,
+    int64 num_partitions,
     const std::function<StatusOr<HloInstruction*>(
         HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot,
     HloModule* module, HloInstruction* original_hlo, int64 lhs_batch_partitions,
@@ -97,11 +99,17 @@ StatusOr<HloInstruction*> PartitionBaseCase(
     int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions,
    int64 output_lhs_non_contracting_partitions,
    int64 output_rhs_non_contracting_partitions,
-    int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b,
+    const SpmdPartitionerOptions& options, SpmdBuilder* b,
     std::vector<SpmdPartitioningVisitor::WindowedDotGeneralLoop>*
-        windowed_dot_general_loops) {
+        windowed_dot_general_loops,
+    bool may_reshard_without_detecting_match) {
   const HloSharding& lhs_sharding = lhs.sharding();
   const HloSharding& rhs_sharding = rhs.sharding();
+  if (lhs_sharding.ReplicateOnLastTileDim() ||
+      rhs_sharding.ReplicateOnLastTileDim() ||
+      output_sharding.ReplicateOnLastTileDim()) {
+    return nullptr;
+  }
   std::vector<int64> lhs_to_rhs_indices(lhs.base_shape().rank(), -1);
   std::vector<int64> lhs_to_output_indices(lhs.base_shape().rank(), -1);
   std::vector<int64> rhs_to_lhs_indices(rhs.base_shape().rank(), -1);
@@ -109,7 +117,7 @@ StatusOr<HloInstruction*> PartitionBaseCase(
   std::vector<int64> output_to_lhs_indices(output_base_shape.rank(), -1);
   std::vector<int64> output_to_rhs_indices(output_base_shape.rank(), -1);
   auto populate_indices_mapping =
-      [&](const DotGeneralDimsMapping::DimsMapping& mapping) {
+      [&](const DotConvDimsMapping::DimsMapping& mapping) {
        if (mapping.lhs >= 0) {
          lhs_to_rhs_indices[mapping.lhs] = mapping.rhs;
lhs_to_output_indices[mapping.lhs] = mapping.output; @@ -135,24 +143,27 @@ StatusOr PartitionBaseCase( for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { populate_indices_mapping(mapping); } + for (const auto& mapping : dims_mapping.conv_spatial_dims) { + populate_indices_mapping(mapping); + } auto lhs_sharding_transposed_to_match_rhs = - TransposeShardingWithCollapsedDims(lhs_sharding, lhs_to_rhs_indices, - rhs_to_lhs_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); auto rhs_sharding_transposed_to_match_lhs = - TransposeShardingWithCollapsedDims(rhs_sharding, rhs_to_lhs_indices, - lhs_to_rhs_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); auto lhs_sharding_transposed_to_match_output = - TransposeShardingWithCollapsedDims(lhs_sharding, lhs_to_output_indices, - output_to_lhs_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); auto rhs_sharding_transposed_to_match_output = - TransposeShardingWithCollapsedDims(rhs_sharding, rhs_to_output_indices, - output_to_rhs_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); auto output_sharding_transposed_to_match_lhs = - TransposeShardingWithCollapsedDims(output_sharding, output_to_lhs_indices, - lhs_to_output_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + output_sharding, output_to_lhs_indices, lhs_to_output_indices); auto output_sharding_transposed_to_match_rhs = - TransposeShardingWithCollapsedDims(output_sharding, output_to_rhs_indices, - rhs_to_output_indices); + hlo_sharding_util::TransposeShardingWithCollapsedDims( + output_sharding, output_to_rhs_indices, rhs_to_output_indices); // LHS and RHS are partitioned the same way and only partitioned in batch // dimensions. @@ -401,7 +412,7 @@ StatusOr PartitionBaseCase( if (output_lhs_non_contracting_partitions == num_partitions && output_sharding_transposed_to_match_lhs == lhs_sharding && ShapeSizeInBytes(rhs.base_shape()) >= - threshold_for_windowed_einsum_mib * 1024 * 1024) { + options.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (rhs_contracting_partitions == num_partitions) { return emit_windowed_dot_general(0, 1, true, false); } @@ -415,7 +426,7 @@ StatusOr PartitionBaseCase( if (output_rhs_non_contracting_partitions == num_partitions && output_sharding_transposed_to_match_rhs == rhs_sharding && ShapeSizeInBytes(lhs.base_shape()) >= - threshold_for_windowed_einsum_mib * 1024 * 1024) { + options.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (lhs_contracting_partitions == num_partitions) { return emit_windowed_dot_general(1, 0, true, false); } @@ -485,29 +496,36 @@ StatusOr PartitionBaseCase( return dot; } - // Output is batch partitioned. - if (output_batch_partitions == num_partitions) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - resharded_rhs.hlo(), b)); - return dot; - } - // Output is partitioned along LHS non-contracting dimensions. 
- if (output_lhs_non_contracting_partitions == num_partitions) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - replicated_rhs.hlo(), b)); - return dot; - } - // Output is partitioned along RHS non-contracting dimensions. - if (output_rhs_non_contracting_partitions == num_partitions) { - auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), - resharded_rhs.hlo(), b)); - return dot; + if (may_reshard_without_detecting_match) { + // Output is batch partitioned. + if (output_batch_partitions == num_partitions) { + auto resharded_lhs = + lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = + rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along LHS non-contracting dimensions. + if (output_lhs_non_contracting_partitions == num_partitions) { + auto resharded_lhs = + lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = + rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } } // Returns true if it is beneficial to reshard the operand at `operand_idx` @@ -558,27 +576,35 @@ StatusOr PartitionBaseCase( StatusOr PartitionDot( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops); StatusOr PartitionDotGroupOnBatch( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, - int64 lhs_contracting_partitions, int64 rhs_contracting_partitions, - int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, int64 lhs_contracting_partitions, + int64 rhs_contracting_partitions, int64 lhs_non_contracting_partitions, + int64 rhs_non_contracting_partitions, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + bool require_matching_devices_to_group, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* 
windowed_dot_general_loops) { + std::vector> + top_level_sharding_to_reset; + auto cleaner = tensorflow::gtl::MakeCleanup([&] { + for (auto& to_reset : top_level_sharding_to_reset) { + to_reset.first->set_sharding(to_reset.second); + } + }); std::vector lhs_dims; std::vector rhs_dims; std::vector output_dims; @@ -608,16 +634,20 @@ StatusOr PartitionDotGroupOnBatch( output_sharding_dims_adjusted_to_lhs[dim.output] = lhs.sharding().tile_assignment().dim(dim.lhs); } + if (require_matching_devices_to_group && lhs_rhs_dims_matching) { + lhs_rhs_dims_matching = + rhs.sharding() == UngroupSharding(AlignGroupsWith( + GroupShardingOnDims(rhs.sharding(), rhs_dims), + GroupShardingOnDims(lhs.sharding(), lhs_dims))); + } auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); PartitionedHlo per_group_lhs = lhs; PartitionedHlo per_group_rhs = rhs; - auto lhs_sharding = lhs.sharding(); - auto rhs_sharding = rhs.sharding(); if (lhs_rhs_dims_matching) { auto lhs_grouped = GroupShardingOnDims(lhs.sharding(), lhs_dims); auto rhs_grouped = GroupShardingOnDims(rhs.sharding(), rhs_dims); - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) > - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeUtil::ByteSizeOf(lhs.hlo()->shape()) > + ShapeUtil::ByteSizeOf(rhs.hlo()->shape())) { rhs_grouped = AlignGroupsWith(std::move(rhs_grouped), lhs_grouped); rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); } else { @@ -627,12 +657,17 @@ StatusOr PartitionDotGroupOnBatch( auto reshaped_output_tiling = output_sharding.tile_assignment(); reshaped_output_tiling.Reshape(output_sharding_dims_adjusted_to_lhs); output_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_output_tiling), - output_dims), + GroupShardingOnDims( + output_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(reshaped_output_tiling) + : HloSharding::Tile(reshaped_output_tiling), + output_dims), lhs_grouped); auto per_group_partitioner_state = CreatePerGroupPartitioningState( lhs.state(), lhs_grouped.device_groups, b); + top_level_sharding_to_reset.emplace_back(lhs.hlo(), lhs.sharding()); lhs.hlo()->set_sharding(lhs_grouped.sharding); + top_level_sharding_to_reset.emplace_back(rhs.hlo(), rhs.sharding()); rhs.hlo()->set_sharding(rhs_grouped.sharding); CHECK(lhs.hlo() != rhs.hlo() || lhs_grouped.sharding == rhs_grouped.sharding); @@ -654,9 +689,9 @@ StatusOr PartitionDotGroupOnBatch( int64 other_contracting_dim_partitions, std::vector* sharding_dims_adjusted_to_output) -> absl::optional { - if (operand.sharding().IsReplicated()) { + if (operand.sharding().IsTileMaximal()) { auto partially_sharded = PerGroupSliceFromReplicated( - operand.hlo(), operand.state().partition_id, + operand.Replicate().hlo(), operand.state().partition_id, output_grouped.device_groups, batch_dims, output_grouped.group_dim_sizes, b); partially_sharded->set_sharding(HloSharding::Replicate()); @@ -678,9 +713,16 @@ StatusOr PartitionDotGroupOnBatch( } int64 ratio = Product(*sharding_dims_adjusted_to_output) / reshaped_tiling.num_elements(); - if (ratio == non_contracting_dim_partitions && - (ratio != contracting_dim_partitions || - contracting_dim_partitions == other_contracting_dim_partitions)) { + if (operand.sharding().ReplicateOnLastTileDim() && + reshaped_tiling.dimensions().back() % ratio == 0) { + sharding_dims_adjusted_to_output->back() /= ratio; + if (sharding_dims_adjusted_to_output->back() == 1) { + sharding_dims_adjusted_to_output->pop_back(); + } + } else if (ratio == non_contracting_dim_partitions && + (ratio != contracting_dim_partitions || + contracting_dim_partitions == + other_contracting_dim_partitions)) { for (int64 dim : non_contracting_dims) { (*sharding_dims_adjusted_to_output)[dim] = 1; } @@ -688,6 +730,8 @@ StatusOr PartitionDotGroupOnBatch( for (int64 dim : contracting_dims) { (*sharding_dims_adjusted_to_output)[dim] = 1; } + } else { + return absl::nullopt; } } // If the operand is initially sharded more ways than the output in the @@ -699,9 +743,19 @@ StatusOr PartitionDotGroupOnBatch( } reshaped_tiling.Reshape(*sharding_dims_adjusted_to_output); auto grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_tiling), batch_dims), + GroupShardingOnDims(operand.base_shape().rank() < + sharding_dims_adjusted_to_output->size() + ? HloSharding::PartialTile(reshaped_tiling) + : HloSharding::Tile(reshaped_tiling), + batch_dims), output_grouped); + if (require_matching_devices_to_group && + operand.sharding() != UngroupSharding(grouped)) { + return absl::nullopt; + } auto resharded = operand.Reshard(UngroupSharding(grouped)); + top_level_sharding_to_reset.emplace_back(resharded.hlo(), + resharded.sharding()); resharded.hlo()->set_sharding(grouped.sharding); return PartitionedHlo(resharded.hlo(), GetPerGroupBaseShape(grouped, operand.base_shape()), @@ -754,12 +808,8 @@ StatusOr PartitionDotGroupOnBatch( GetPerGroupBaseShape(output_grouped, output_base_shape), output_grouped.sharding, dims_mapping, num_partitions / output_grouped.device_groups.size(), - create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, + create_sharded_dot, module, original_hlo, options, b, windowed_dot_general_loops)); - // Make sure the operands' sharding are set to the ungrouped ones. 
- lhs.hlo()->set_sharding(lhs_sharding); - rhs.hlo()->set_sharding(rhs_sharding); dot->set_sharding(UngroupSharding(output_grouped)); return PartitionedHlo(dot, output_base_shape, lhs.state()) .Reshard(output_sharding) @@ -769,65 +819,96 @@ StatusOr PartitionDotGroupOnBatch( StatusOr PartitionDotGroupOnNonContracting( bool lhs_matching, PartitionedHlo matching, PartitionedHlo other, int64 matching_contracting_partitions, int64 other_contracting_partitions, - int64 matching_non_contracting_partitions, + absl::Span + partitioned_non_contractin_dims, int64 other_non_contracting_partitions, int64 output_other_non_contracting_partitions, const Shape& output_base_shape, const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const DotConvDimsMapping& dims_mapping, int64 num_partitions, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + bool require_matching_devices_to_group, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { - const bool may_replicate_other_contracting_dims = - (other_contracting_partitions == matching_non_contracting_partitions && - other_non_contracting_partitions == - output_other_non_contracting_partitions); - const bool may_replicate_other_non_contracting_dims = - matching_non_contracting_partitions == other_non_contracting_partitions && - matching_contracting_partitions == other_contracting_partitions; - std::vector other_group_dims; - if (may_replicate_other_contracting_dims && - (!may_replicate_other_non_contracting_dims || - ShapeUtil::ByteSizeOf(other.base_shape()) <= - ShapeUtil::ByteSizeOf(output_base_shape))) { - for (const auto& dim : dims_mapping.contracting_dims) { - other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + std::vector> + top_level_sharding_to_reset; + auto cleaner = tensorflow::gtl::MakeCleanup([&] { + for (auto& to_reset : top_level_sharding_to_reset) { + to_reset.first->set_sharding(to_reset.second); } - } else if (may_replicate_other_non_contracting_dims) { - for (const auto& dim : lhs_matching - ? dims_mapping.rhs_non_contracting_dims - : dims_mapping.lhs_non_contracting_dims) { - other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); - } - } else if (!other.sharding().IsReplicated()) { - return nullptr; - } + }); + auto matching_sharding_dims = matching.sharding().tile_assignment().dimensions(); std::vector matching_dims; std::vector output_dims; + int64 group_count = 1; // Make sure the partitioning on matching's non-contracting dimensions // defines the same device groups for both matching and output. - for (const auto& dim : lhs_matching ? dims_mapping.lhs_non_contracting_dims - : dims_mapping.rhs_non_contracting_dims) { + for (const auto& dim : partitioned_non_contractin_dims) { int64 md = lhs_matching ? 
dim.lhs : dim.rhs; matching_sharding_dims[md] = output_sharding.tile_assignment().dim(dim.output); matching_dims.push_back(md); output_dims.push_back(dim.output); + group_count *= output_sharding.tile_assignment().dim(dim.output); } auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); auto reshaped_matching_tiling = matching.sharding().tile_assignment(); reshaped_matching_tiling.Reshape(matching_sharding_dims); auto matching_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_matching_tiling), - matching_dims), + GroupShardingOnDims( + matching.sharding().ReplicateOnLastTileDim() + ? HloSharding::PartialTile(reshaped_matching_tiling) + : HloSharding::Tile(reshaped_matching_tiling), + matching_dims), output_grouped); + if (require_matching_devices_to_group && + matching.sharding() != UngroupSharding(matching_grouped)) { + return nullptr; + } + + std::vector other_group_dims; + if (other.sharding().ReplicateOnLastTileDim() && + other.sharding().tile_assignment().dimensions().back() % group_count == + 0) { + other_group_dims.push_back(other.base_shape().rank()); + } else { + const bool may_replicate_other_contracting_dims = + (other_contracting_partitions == group_count && + other_non_contracting_partitions == + output_other_non_contracting_partitions); + const bool may_replicate_other_non_contracting_dims = + group_count == other_non_contracting_partitions && + matching_contracting_partitions == other_contracting_partitions; + if (auto found_dims = FindMatchingPartitionedDimsForGrouping( + other.sharding(), output_grouped.device_groups)) { + other_group_dims = std::move(*found_dims); + } else if (may_replicate_other_contracting_dims && + (!may_replicate_other_non_contracting_dims || + ShapeUtil::ByteSizeOf(other.hlo()->shape()) <= + ShapeUtil::ByteSizeOf(MakePartitionedShape( + output_base_shape, output_sharding)))) { + for (const auto& dim : dims_mapping.contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else if (may_replicate_other_non_contracting_dims) { + for (const auto& dim : lhs_matching + ? dims_mapping.rhs_non_contracting_dims + : dims_mapping.lhs_non_contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else { + other = other.Replicate(); + } + } + matching = matching.Reshard(UngroupSharding(matching_grouped)); auto per_group_partitioner_state = CreatePerGroupPartitioningState( matching.state(), matching_grouped.device_groups, b); + top_level_sharding_to_reset.emplace_back(matching.hlo(), matching.sharding()); matching.hlo()->set_sharding(matching_grouped.sharding); auto matching_p = PartitionedHlo( matching.hlo(), @@ -835,13 +916,31 @@ StatusOr PartitionDotGroupOnNonContracting( per_group_partitioner_state); auto partially_replicated_other = other.hlo(); - if (!other.sharding().IsReplicated()) { + if (other_group_dims.size() == 1 && + other_group_dims[0] == other.base_shape().rank()) { + // Group on replication dim. 
+ auto grouped = AlignGroupsWith( + GroupShardingOnDims( + other.sharding(), {other_group_dims[0]}, + {other.sharding().tile_assignment().dimensions().back() / + group_count}), + output_grouped); + other = other.Reshard(UngroupSharding(grouped)); + partially_replicated_other = other.hlo(); + top_level_sharding_to_reset.emplace_back(other.hlo(), other.sharding()); + partially_replicated_other->set_sharding(grouped.sharding); + } else if (!other.sharding().IsReplicated()) { auto other_grouped = AlignGroupsWith(GroupShardingOnDims(other.sharding(), other_group_dims), output_grouped, /*ignore_group_order=*/true); other = other.Reshard(UngroupSharding(other_grouped)); partially_replicated_other = - other.ReplicatePartial(other_grouped.group_dims); + other + .Reshard(hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + other.sharding(), other_grouped.group_dims)) + .hlo(); + top_level_sharding_to_reset.emplace_back( + partially_replicated_other, partially_replicated_other->sharding()); partially_replicated_other->set_sharding(other_grouped.sharding); } auto other_p = PartitionedHlo(partially_replicated_other, other.base_shape(), @@ -853,31 +952,188 @@ StatusOr PartitionDotGroupOnNonContracting( GetPerGroupBaseShape(output_grouped, output_base_shape), output_grouped.sharding, dims_mapping, num_partitions / matching_grouped.device_groups.size(), - create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, + create_sharded_dot, module, original_hlo, options, b, windowed_dot_general_loops)); - // Reset matching's sharding to the ungrouped one. - matching.hlo()->set_sharding(UngroupSharding(matching_grouped)); return dot; } +StatusOr PartitionDotGroupOnContracting( + PartitionedHlo lhs, PartitionedHlo rhs, + absl::Span + partitioned_contractin_dims, + int64 output_batch_partitions, int64 output_lhs_non_contracting_partitions, + int64 output_rhs_non_contracting_partitions, const Shape& output_base_shape, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + bool require_matching_devices_to_group, + const SpmdPartitionerOptions& options, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + std::vector> + top_level_sharding_to_reset; + auto cleaner = tensorflow::gtl::MakeCleanup([&] { + for (auto& to_reset : top_level_sharding_to_reset) { + to_reset.first->set_sharding(to_reset.second); + } + }); + auto lhs_sharding = lhs.sharding(); + auto rhs_sharding = rhs.sharding(); + auto lhs_tile_shape = lhs_sharding.tile_assignment().dimensions(); + auto rhs_tile_shape = rhs_sharding.tile_assignment().dimensions(); + std::vector lhs_dims; + std::vector rhs_dims; + int64 group_count = 1; + for (const auto& dim : partitioned_contractin_dims) { + lhs_dims.push_back(dim.lhs); + rhs_dims.push_back(dim.rhs); + group_count *= lhs_sharding.tile_assignment().dim(dim.lhs); + } + if (ShapeUtil::ByteSizeOf(lhs.hlo()->shape()) > + ShapeUtil::ByteSizeOf(rhs.hlo()->shape())) { + for (const auto& dim : partitioned_contractin_dims) { + rhs_tile_shape[dim.rhs] = lhs_tile_shape[dim.lhs]; + } + auto new_tile = rhs.sharding().tile_assignment(); + new_tile.Reshape(rhs_tile_shape); + rhs_sharding = rhs_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(new_tile) + : HloSharding::Tile(new_tile); + } else { + for (const auto& dim : partitioned_contractin_dims) { + lhs_tile_shape[dim.lhs] = rhs_tile_shape[dim.rhs]; + } + auto new_tile = lhs.sharding().tile_assignment(); + new_tile.Reshape(lhs_tile_shape); + lhs_sharding = lhs_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile) + : HloSharding::Tile(new_tile); + } + auto lhs_grouped = GroupShardingOnDims(lhs_sharding, lhs_dims); + auto rhs_grouped = GroupShardingOnDims(rhs_sharding, rhs_dims); + if (ShapeUtil::ByteSizeOf(lhs.hlo()->shape()) > + ShapeUtil::ByteSizeOf(rhs.hlo()->shape())) { + rhs_grouped = AlignGroupsWith(rhs_grouped, lhs_grouped); + rhs_sharding = UngroupSharding(rhs_grouped); + if (require_matching_devices_to_group && rhs.sharding() != rhs_sharding) { + return nullptr; + } + rhs = rhs.Reshard(rhs_sharding); + } else { + lhs_grouped = AlignGroupsWith(lhs_grouped, rhs_grouped); + lhs_sharding = UngroupSharding(lhs_grouped); + if (require_matching_devices_to_group && lhs.sharding() != lhs_sharding) { + return nullptr; + } + lhs = lhs.Reshard(lhs_sharding); + } + top_level_sharding_to_reset.emplace_back(lhs.hlo(), lhs_sharding); + lhs.hlo()->set_sharding(lhs_grouped.sharding); + top_level_sharding_to_reset.emplace_back(rhs.hlo(), rhs_sharding); + rhs.hlo()->set_sharding(rhs_grouped.sharding); + + HloSharding inner_output_sharding = HloSharding::Replicate(); + HloSharding outer_output_tmp_sharding = HloSharding::Replicate(); + if (output_sharding.ReplicateOnLastTileDim() && + output_sharding.tile_assignment().dimensions().back() % group_count == + 0) { + auto grouped = AlignGroupsWith( + GroupShardingOnDims( + output_sharding, + {output_sharding.tile_assignment().num_dimensions() - 1}, + {output_sharding.tile_assignment().dimensions().back() / + group_count}), + lhs_grouped); + outer_output_tmp_sharding = UngroupSharding(grouped); + inner_output_sharding = std::move(grouped.sharding); + } else { + std::vector group_dims; + if (auto found_dims = FindMatchingPartitionedDimsForGrouping( + output_sharding, lhs_grouped.device_groups)) { + group_dims = std::move(*found_dims); + } else if (output_lhs_non_contracting_partitions == group_count || + output_rhs_non_contracting_partitions == group_count || + output_batch_partitions == group_count) { + if (output_lhs_non_contracting_partitions == group_count) { + for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { + group_dims.push_back(dim.output); + } + } else if (output_rhs_non_contracting_partitions == group_count) { + for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { + group_dims.push_back(dim.output); + } + } else { + for (const auto& dim : dims_mapping.batch_dims) { + group_dims.push_back(dim.output); + } + } + } + if (!group_dims.empty()) { + auto grouped = AlignGroupsWith( + GroupShardingOnDims(output_sharding, group_dims), lhs_grouped); + inner_output_sharding = grouped.sharding; + outer_output_tmp_sharding = + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + UngroupSharding(grouped), group_dims); + } + } + auto inner_state = CreatePerGroupPartitioningState( + lhs.state(), lhs_grouped.device_groups, b); + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDot( + PartitionedHlo(lhs.hlo(), + GetPerGroupBaseShape(lhs_grouped, lhs.base_shape()), + inner_state), + PartitionedHlo(rhs.hlo(), + GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), + inner_state), + MakePartitionedShape(output_base_shape, outer_output_tmp_sharding), + inner_output_sharding, 
dims_mapping, num_partitions / group_count, + create_sharded_dot, module, original_hlo, options, b, + windowed_dot_general_loops)); + if (!dot) { + return nullptr; + } + std::vector other_lhs_dims; + for (int64 i = 0; i < lhs_sharding.tile_assignment().num_dimensions(); ++i) { + if (!absl::c_linear_search(lhs_dims, i)) { + other_lhs_dims.push_back(i); + } + } + auto inverse_grouped = GroupShardingOnDims(lhs_sharding, other_lhs_dims); + auto ar = + CreatePerGroupPartitioningState(lhs.state(), + inverse_grouped.device_groups, b) + .collective_ops_creator.create_cross_partition_all_reduce( + b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), + {}, (*lhs.state().next_channel_id)++); + ar->set_sharding(outer_output_tmp_sharding); + return PartitionedHlo(ar, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + // Recursive partitioning function. If there are partial dimensions matching in // the operands and output, group the devices and recursively partition the // in-group dot. StatusOr PartitionDot( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + bool require_matching_devices_to_group, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. auto get_partitions_for_dims = [&](const HloSharding& sharding, - absl::Span dims, + absl::Span dims, int lhs_rhs_or_output) { int64 partitions = 1; if (sharding.IsTileMaximal()) { @@ -913,6 +1169,52 @@ StatusOr PartitionDot( output_sharding, dims_mapping.lhs_non_contracting_dims, 2); const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( output_sharding, dims_mapping.rhs_non_contracting_dims, 2); + const int64 lhs_conv_spatial_partitions = get_partitions_for_dims( + lhs.sharding(), dims_mapping.conv_spatial_dims, 0); + const int64 rhs_conv_spatial_partitions = get_partitions_for_dims( + rhs.sharding(), dims_mapping.conv_spatial_dims, 1); + const int64 output_conv_spatial_partitions = get_partitions_for_dims( + output_sharding, dims_mapping.conv_spatial_dims, 2); + // Before we find partial matches along the dimensions, invoke base case again + // without may_reshard_without_detecting_match. + + // Try partition the purely spatially-partitioned convolution with convolution + // spatial dimension partitioned or depthwise parallel dimension partitioned. + if (!dims_mapping.conv_spatial_dims.empty() && + (lhs_conv_spatial_partitions > 1 || rhs_conv_spatial_partitions > 1 || + output_conv_spatial_partitions > 1 || + original_hlo->batch_group_count() > 1 || + original_hlo->feature_group_count() > 1)) { + const auto& conv_dnums = original_hlo->convolution_dimension_numbers(); + auto window = original_hlo->window(); + + // TODO(wangtao): remove this hack by passing create_sharded_conv to + // PartitionConv. + // Update convolution window when it is in the recursive call for + // batch_dims. 
+ if (original_hlo->batch_group_count() == 1 && + original_hlo->feature_group_count() == 1 && + !ShapeUtil::Compatible(original_hlo->shape(), output_base_shape)) { + for (const auto& dim : dims_mapping.batch_dims) { + auto wd = window.mutable_dimensions(dim.spatial); + wd->set_size(lhs.hlo()->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial))); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + } + + TF_ASSIGN_OR_RETURN( + auto partitioned_conv, + PartitionConvolution(lhs, rhs, output_base_shape, output_sharding, + dims_mapping, window, original_hlo, num_partitions, + options, lhs.state().partition_id, module, b)); + + if (partitioned_conv) { + return partitioned_conv; + } + } + TF_ASSIGN_OR_RETURN( auto try_partitioned_dot, PartitionBaseCase( @@ -922,8 +1224,9 @@ StatusOr PartitionDot( lhs_contracting_partitions, rhs_contracting_partitions, lhs_non_contracting_partitions, rhs_non_contracting_partitions, output_lhs_non_contracting_partitions, - output_rhs_non_contracting_partitions, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + output_rhs_non_contracting_partitions, options, b, + windowed_dot_general_loops, + /*may_reshard_without_detecting_match=*/false)); if (try_partitioned_dot) { return try_partitioned_dot; } @@ -941,7 +1244,7 @@ StatusOr PartitionDot( num_partitions, lhs_contracting_partitions, rhs_contracting_partitions, lhs_non_contracting_partitions, rhs_non_contracting_partitions, create_sharded_dot, module, - original_hlo, threshold_for_windowed_einsum_mib, b, + original_hlo, require_matching_devices_to_group, options, b, windowed_dot_general_loops)); if (dot) { return dot; @@ -974,19 +1277,180 @@ StatusOr PartitionDot( : rhs_contracting_partitions, lhs_matching ? rhs_contracting_partitions : lhs_contracting_partitions, - lhs_matching ? lhs_non_contracting_partitions - : rhs_non_contracting_partitions, + lhs_matching ? dims_mapping.lhs_non_contracting_dims + : dims_mapping.rhs_non_contracting_dims, lhs_matching ? rhs_non_contracting_partitions : lhs_non_contracting_partitions, lhs_matching ? output_rhs_non_contracting_partitions : output_lhs_non_contracting_partitions, output_base_shape, output_sharding, dims_mapping, num_partitions, create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); if (dot) { return dot; } } + if (lhs_non_contracting_partitions > 1 && + output_lhs_non_contracting_partitions > 1) { + // If part of LHS non-contracting dims match output, try them. 
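A small sketch, assuming plain tile-count vectors rather than `HloSharding`, of the partial-match scan used by the grouping cases here: only dimensions whose partition counts agree between an operand and the output (and exceed 1) are worth grouping on. `DimPair` and `MatchingPartitionedDims` are illustrative names.

#include <cstdint>
#include <vector>

// Hypothetical mirror of one dims_mapping entry: an operand dimension and the
// output dimension it corresponds to.
struct DimPair { int64_t operand_dim; int64_t output_dim; };

// Returns the pairs whose partition counts agree (and are > 1) between the
// operand tiling and the output tiling, i.e. the dimensions that can be
// grouped without resharding either side.
std::vector<DimPair> MatchingPartitionedDims(
    const std::vector<int64_t>& operand_tile_dims,
    const std::vector<int64_t>& output_tile_dims,
    const std::vector<DimPair>& mapping) {
  std::vector<DimPair> matching;
  for (const DimPair& p : mapping) {
    int64_t partitions = operand_tile_dims[p.operand_dim];
    if (partitions > 1 && partitions == output_tile_dims[p.output_dim]) {
      matching.push_back(p);
    }
  }
  return matching;
}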
+ std::vector matching_dims; + for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { + int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); + if (lhs_partitions > 1 && + lhs_partitions == output_sharding.tile_assignment().dim(dim.output)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnNonContracting( + /*lhs_matching=*/true, lhs, rhs, lhs_contracting_partitions, + rhs_contracting_partitions, matching_dims, + rhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, options, + b, windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } + if (rhs_non_contracting_partitions > 1 && + output_rhs_non_contracting_partitions > 1) { + // If part of RHS non-contracting dims match output, try them. + std::vector matching_dims; + for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { + int64 rhs_partitions = rhs.sharding().tile_assignment().dim(dim.rhs); + if (rhs_partitions > 1 && + rhs_partitions == output_sharding.tile_assignment().dim(dim.output)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnNonContracting( + /*lhs_matching=*/false, rhs, lhs, rhs_contracting_partitions, + lhs_contracting_partitions, matching_dims, + lhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, options, + b, windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } + + // Case 3: Group partitions by contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions > 1) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnContracting( + lhs, rhs, dims_mapping.contracting_dims, output_batch_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + if (lhs_contracting_partitions > 1 && rhs_contracting_partitions > 1) { + // If part of contracting dims match, try them. + std::vector matching_dims; + for (const auto& dim : dims_mapping.contracting_dims) { + int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); + if (lhs_partitions > 1 && + lhs_partitions == rhs.sharding().tile_assignment().dim(dim.rhs)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnContracting( + lhs, rhs, matching_dims, output_batch_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, options, + b, windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } + + // Case 4: If operands are replicated but output is partially replicated, + // recursive call with partial replication removed. 
+ if (lhs.sharding().IsReplicated() && rhs.sharding().IsReplicated() && + output_sharding.ReplicateOnLastTileDim()) { + auto grouped_output = + GroupShardingOnDims(output_sharding, {output_base_shape.rank()}); + auto inner_state = CreatePerGroupPartitioningState( + lhs.state(), grouped_output.device_groups, b); + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDot(PartitionedHlo(lhs.hlo(), lhs.base_shape(), inner_state), + PartitionedHlo(rhs.hlo(), rhs.base_shape(), inner_state), + output_base_shape, grouped_output.sharding, dims_mapping, + output_sharding.NumTiles(), create_sharded_dot, module, + original_hlo, options, b, windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + + // We failed to find partial matches, invoke base case again with + // may_reshard_without_detecting_match. + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionBaseCase( + lhs, rhs, output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, module, original_hlo, + lhs_batch_partitions, rhs_batch_partitions, output_batch_partitions, + lhs_contracting_partitions, rhs_contracting_partitions, + lhs_non_contracting_partitions, rhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, options, b, + windowed_dot_general_loops, + /*may_reshard_without_detecting_match=*/true)); + if (dot) { + return dot; + } + return nullptr; +} + +StatusOr PartitionDot( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, + HloModule* module, HloInstruction* original_hlo, + const SpmdPartitionerOptions& options, SpmdBuilder* b, + std::vector* + windowed_dot_general_loops) { + // First try partitioning without resharding the groups, then try allow + // resharding the groups. + for (bool require_matching_devices_to_group : {true, false}) { + TF_ASSIGN_OR_RETURN( + auto try_partition, + PartitionDot(lhs, rhs, output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, module, original_hlo, + require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); + if (try_partition) { + return try_partition; + } + } // Default action. 
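The wrapper above runs the whole decision tree twice, first requiring existing device groups to match and then allowing them to be resharded. A compact, self-contained sketch of that strict-then-relaxed driver, with a stubbed `TryPartition` standing in for the real recursive call (both names are illustrative, not the actual API):

#include <optional>
#include <string>

// Hypothetical attempt at a partitioning strategy; in this stub it succeeds
// only when device groups are allowed to be resharded (strict == false).
std::optional<std::string> TryPartition(bool require_matching_device_groups) {
  if (require_matching_device_groups) return std::nullopt;
  return std::string("grouped-dot");
}

// Mirrors the two-pass driver: first try without resharding the device
// groups, then allow resharding, and fall back to replication if both fail.
std::string PartitionOrReplicate() {
  for (bool strict : {true, false}) {
    if (auto result = TryPartition(strict)) return *result;
  }
  return "replicate-and-dot";  // Default action.
}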
TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.Replicate().hlo(), @@ -1000,7 +1464,7 @@ StatusOr PartitionDot( } // namespace Status SpmdPartitioningVisitor::HandleDotHelper( - HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + HloInstruction* hlo, const DotConvDimsMapping& dims_mapping, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { auto& lhs = GetPartitionedHlo(hlo->operand(0)); @@ -1008,9 +1472,8 @@ Status SpmdPartitioningVisitor::HandleDotHelper( TF_ASSIGN_OR_RETURN( auto partitioned_dot, PartitionDot(lhs, rhs, hlo->shape(), hlo->sharding(), dims_mapping, - num_partitions_, create_sharded_dot, module_, hlo, - options_.threshold_for_windowed_einsum_mib, &b_, - &windowed_dot_general_loops_)); + num_partitions_, create_sharded_dot, module_, hlo, options_, + &b_, &windowed_dot_general_loops_)); SetPartitionedHlo(hlo, [&] { return partitioned_dot; }); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc new file mode 100644 index 00000000000..bdc96afba88 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc @@ -0,0 +1,132 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h" + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace { + +HloCollectiveInstruction* MayConsiderAsAllGather(HloInstruction* hlo, + bool for_replicas) { + auto coll = DynCast(hlo); + if (!coll) { + return nullptr; + } + if (coll->constrain_layout()) { + return nullptr; + } + if (for_replicas == coll->channel_id().has_value()) { + return nullptr; + } + if (coll->opcode() == HloOpcode::kAllGather) { + return coll; + } + // Consider broadcast -> dynamic-update-slice -> all-reduce as all-gather. + if (coll->opcode() == HloOpcode::kAllReduce && coll->shape().IsArray()) { + auto operand = coll->operand(0); + return operand->opcode() == HloOpcode::kDynamicUpdateSlice && + operand->operand(0)->opcode() == HloOpcode::kBroadcast + ? coll + : nullptr; + } + return nullptr; +} + +StatusOr RunOnComputation(HloComputation* comp, bool for_replicas, + int64 distance_threshold) { + // We consider estimate the live ranges of all-gathers by comparing their + // users' distance to the root, e.g., height. 
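A standalone sketch of the height estimate mentioned in the comment above, assuming a post-ordered node list in which each node stores the indices of its users, so that walking the list backwards visits users before operands. `Node` and `ComputeHeights` are illustrative names; the real pass works on `HloInstruction*`.

#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

// Hypothetical node in a post-ordered computation: users always appear later
// in the post-order than their operands.
struct Node { std::vector<int> users; };

// Height is 0 for nodes with no users (the root/outputs) and otherwise one
// more than the highest user. Nodes with similar heights have overlapping
// live ranges, which is what the CSE distance threshold compares.
std::map<int, int64_t> ComputeHeights(const std::vector<Node>& post_order) {
  std::map<int, int64_t> height;
  for (int i = static_cast<int>(post_order.size()) - 1; i >= 0; --i) {
    int64_t h = 0;
    for (int user : post_order[i].users) {
      h = std::max(h, height[user] + 1);
    }
    height[i] = h;
  }
  return height;
}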
+ absl::flat_hash_map height; + auto ordered_hlos = comp->MakeInstructionPostOrder(); + int64 max_height = 0; + for (auto it = ordered_hlos.rbegin(); it != ordered_hlos.rend(); ++it) { + auto hlo = *it; + int64 h = 0; + for (auto user : hlo->users()) { + h = std::max(h, height[user]) + 1; + } + max_height = std::max(max_height, h); + height[hlo] = h; + } + + auto lowest_user_height = [&](const HloInstruction* hlo) { + int64 lowest = height[hlo]; + for (auto user : hlo->users()) { + lowest = std::min(lowest, height[user]); + } + return lowest; + }; + + absl::flat_hash_map> + operand_to_ag; + bool changed = false; + for (auto hlo : ordered_hlos) { + auto ag = MayConsiderAsAllGather(hlo, for_replicas); + if (!ag) { + continue; + } + + auto& earlier_ags = operand_to_ag[ag->operand(0)]; + bool found = false; + int64 ag_height = height[ag]; + for (auto& eag : earlier_ags) { + auto old_channel_id = ag->channel_id(); + if (eag->channel_id() && ag->channel_id()) { + ag->set_channel_id(eag->channel_id()); + } + if (!eag->Identical(*ag)) { + ag->set_channel_id(old_channel_id); + continue; + } + found = true; + ag->set_channel_id(old_channel_id); + if (lowest_user_height(eag) > ag_height + distance_threshold) { + eag = ag; + continue; + } + changed = true; + VLOG(1) << "Replacing " << ag->ToString() << " with " << eag->ToString(); + TF_RETURN_IF_ERROR(ag->ReplaceAllUsesWith(eag)); + break; + } + if (!found) { + earlier_ags.push_back(ag); + } + } + return changed; +} + +} // namespace + +StatusOr ScheduleAwareAllGatherCSE::Run(HloModule* module) { + bool changed = false; + for (auto comp : module->computations()) { + TF_ASSIGN_OR_RETURN( + auto comp_changed, + RunOnComputation(comp, for_replicas_, distance_threshold_)); + changed |= comp_changed; + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h new file mode 100644 index 00000000000..4653286ae97 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Performs CSE for all-gather if their users are within reasonable live range. +class ScheduleAwareAllGatherCSE : public HloModulePass { + public: + // distance_threshold: maximum live range (in number of HLO instructions on + // the path) to consider CSE. + // for_replicas: specifies if this pass is for cross-replica or + // cross-partition all-gathers. 
+ explicit ScheduleAwareAllGatherCSE(int64 distance_threshold, + bool for_replicas) + : distance_threshold_(distance_threshold), for_replicas_(for_replicas) {} + + ~ScheduleAwareAllGatherCSE() override = default; + absl::string_view name() const override { + return "schedule-aware-all-gather-cse"; + } + + StatusOr Run(HloModule* module) override; + + private: + int64 distance_threshold_; + bool for_replicas_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 2d76966a494..ceb81330639 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" @@ -216,20 +217,147 @@ HloInstruction* SpmdBuilder::AddInstruction( if (visiting_hlo_) { instructions_[visiting_hlo_].push_back(hlo); } + if (hlo->opcode() == HloOpcode::kBroadcast) { + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i)) { + broadcast_dims_[hlo].insert(i); + } + } + } + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + absl::flat_hash_set broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + broadcast_dims.insert(i); + } + for (int64 i = 0; i < hlo->operand_count(); ++i) { + auto it = broadcast_dims_.find(hlo->operand(i)); + if (it == broadcast_dims_.end()) { + broadcast_dims.clear(); + break; + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!it->second.contains(i)) { + broadcast_dims.erase(i); + } + } + } + if (!broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(broadcast_dims); + } + } + if (hlo->opcode() == HloOpcode::kTranspose) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set xpose_broadcast_dims; + std::vector reverse_map(hlo->shape().rank()); + for (int64 i = 0; i < reverse_map.size(); ++i) { + reverse_map[hlo->dimensions(i)] = i; + } + for (int64 dim : it->second) { + xpose_broadcast_dims.insert(reverse_map[dim]); + } + broadcast_dims_[hlo] = std::move(xpose_broadcast_dims); + } + } + if (hlo->opcode() == HloOpcode::kReshape && + Product(hlo->shape().dimensions()) > 0) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set reshape_broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + reshape_broadcast_dims.insert(i); + } + std::vector before_dim_size_stack; + std::vector after_dim_size_stack; + for (int64 i = hlo->operand(0)->shape().rank() - 1; i >= 0; --i) { + before_dim_size_stack.push_back(hlo->operand(0)->shape().dimensions(i)); + } + for (int64 i = hlo->shape().rank() - 1; i >= 0; --i) { + after_dim_size_stack.push_back(hlo->shape().dimensions(i)); + } + while (!before_dim_size_stack.empty() && !after_dim_size_stack.empty()) { + int64 before_size = before_dim_size_stack.back(); + int64 after_size = after_dim_size_stack.back(); + int64 current_before_dim = + hlo->operand(0)->shape().rank() - before_dim_size_stack.size(); + int64 current_after_dim = + hlo->shape().rank() - after_dim_size_stack.size(); + before_dim_size_stack.pop_back(); + after_dim_size_stack.pop_back(); + if 
(!it->second.contains(current_before_dim)) { + reshape_broadcast_dims.erase(current_after_dim); + } + if (before_size == after_size) { + continue; + } + if (before_size % after_size == 0) { + // Split dim. + before_dim_size_stack.push_back(before_size / after_size); + } else if (after_size % before_size == 0) { + // Merge dim. + after_dim_size_stack.push_back(after_size / before_size); + } else { + // Other cases, mark all remaining dims as non-broadcast. + for (int64 i = current_after_dim; i < hlo->shape().rank(); ++i) { + reshape_broadcast_dims.erase(i); + } + break; + } + } + if (!before_dim_size_stack.empty() || !after_dim_size_stack.empty()) { + reshape_broadcast_dims.clear(); + } + if (!reshape_broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(reshape_broadcast_dims); + } + } + } + if (hlo->opcode() == HloOpcode::kSlice || + hlo->opcode() == HloOpcode::kDynamicSlice) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + auto dims = it->second; + broadcast_dims_[hlo] = std::move(dims); + } + } + if (hlo->opcode() == HloOpcode::kPad) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set pad_broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& dim = hlo->padding_config().dimensions(i); + if (dim.edge_padding_low() == 0 && dim.edge_padding_high() == 0 && + dim.interior_padding() == 0 && it->second.contains(i)) { + pad_broadcast_dims.insert(i); + } + } + if (!pad_broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(pad_broadcast_dims); + } + } + } return hlo; } PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; - for (auto& entry : cache) { - if (entry.first == target) { - return entry.second; + const bool is_to_replicate = + hlo_->shape().IsArray() && target.NumTiles() < sharding().NumTiles(); + if (!is_to_replicate || state_.partitioner->options().cache_all_gather) { + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } } } - cache.emplace_back(target, ReshardNoCache(target)); - state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + auto resharded = ReshardNoCache(target); + state_.reshard_cache->per_hlo_cache[resharded.hlo()] .reshard_cache.emplace_back(sharding(), *this); - return cache.back().second; + if (!is_to_replicate || state_.partitioner->options().cache_all_gather) { + cache.emplace_back(target, std::move(resharded)); + return cache.back().second; + } + return resharded; } PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { @@ -282,6 +410,20 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return ReshardWithAllToAll(target, *src_tgt_dims); } + if (!target.IsTileMaximal() && sharding().ReplicateOnLastTileDim()) { + auto try_reshard = ReshardFromPartialReplicateWithDynamicSlice(target); + if (try_reshard.has_value()) { + return try_reshard.value(); + } + } + + if (!sharding().IsTileMaximal() && target.ReplicateOnLastTileDim()) { + auto try_reshard = ReshardToPartialReplicateWithAllGather(target); + if (try_reshard.has_value()) { + return try_reshard.value(); + } + } + // If not replicated yet, first replicate and then reshard to use one of the // two implementations below. 
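The reshape case of the broadcast-dimension tracking above walks two stacks of dimension sizes to match split and merged dimensions. A standalone transcription of that walk over plain vectors (sizes assumed positive, as in the guarded `Product(...) > 0` case); `ReshapeBroadcastDims` is an illustrative name.

#include <cstdint>
#include <set>
#include <vector>

// Propagates "every element along this dimension is identical" facts through
// a reshape, working on plain dimension vectors instead of Shape.
std::set<int64_t> ReshapeBroadcastDims(const std::vector<int64_t>& before,
                                       const std::vector<int64_t>& after,
                                       const std::set<int64_t>& before_bcast) {
  std::set<int64_t> result;
  for (int64_t i = 0; i < static_cast<int64_t>(after.size()); ++i) result.insert(i);
  std::vector<int64_t> before_stack(before.rbegin(), before.rend());
  std::vector<int64_t> after_stack(after.rbegin(), after.rend());
  while (!before_stack.empty() && !after_stack.empty()) {
    int64_t before_size = before_stack.back();
    int64_t after_size = after_stack.back();
    int64_t cur_before = before.size() - before_stack.size();
    int64_t cur_after = after.size() - after_stack.size();
    before_stack.pop_back();
    after_stack.pop_back();
    if (!before_bcast.count(cur_before)) result.erase(cur_after);
    if (before_size == after_size) continue;
    if (before_size % after_size == 0) {
      before_stack.push_back(before_size / after_size);  // Split dimension.
    } else if (after_size % before_size == 0) {
      after_stack.push_back(after_size / before_size);   // Merged dimension.
    } else {
      // Irregular case: give up on all remaining output dimensions.
      for (int64_t i = cur_after; i < static_cast<int64_t>(after.size()); ++i)
        result.erase(i);
      break;
    }
  }
  if (!before_stack.empty() || !after_stack.empty()) result.clear();
  return result;
}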
if (!sharding().IsReplicated()) { @@ -296,6 +438,19 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return PartitionedHlo(copy, base_shape_, state_); } + // 'Replicated' to partial replicated. + if (target.ReplicateOnLastTileDim()) { + std::vector group_dims(target.tile_assignment().num_dimensions() - + 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto target_grouped = GroupShardingOnDims(target, group_dims); + auto partially_sharded = PerGroupSliceFromReplicated( + hlo_, state_.partition_id, target_grouped.device_groups, group_dims, + target_grouped.group_dim_sizes, state_.b); + partially_sharded->set_sharding(target); + return PartitionedHlo(partially_sharded, base_shape(), state_); + } + // 'Replicated' to 'Tiled'. auto padded_hlo = PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); @@ -651,6 +806,14 @@ PartitionedHlo::ReshardAsWindowedInput(const Window& window, } PartitionedHlo PartitionedHlo::Replicate() { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + if (state_.partitioner->options().cache_all_gather) { + for (auto& entry : cache) { + if (entry.first.IsReplicated()) { + return entry.second; + } + } + } const HloSharding& sharding = hlo_->sharding(); const Shape& shape = hlo_->shape(); CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); @@ -658,7 +821,6 @@ PartitionedHlo PartitionedHlo::Replicate() { if (sharding.IsReplicated()) { return *this; } - auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; for (auto& entry : cache) { if (entry.first.IsReplicated()) { return entry.second; @@ -667,8 +829,11 @@ PartitionedHlo PartitionedHlo::Replicate() { auto update_cache = [&](PartitionedHlo resharded) { state_.reshard_cache->per_hlo_cache[resharded.hlo()] .reshard_cache.emplace_back(sharding, *this); - cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); - return cache.back().second; + if (state_.partitioner->options().cache_all_gather) { + cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); + return cache.back().second; + } + return resharded; }; // 'Single Device' to 'Repliated'. if (sharding.IsTileMaximal()) { @@ -724,11 +889,160 @@ HloInstruction* PartitionedHlo::ReplicatePartial(absl::Span dims) { std::vector strides(target_shape.rank(), 1); result = state_.b->AddInstruction( HloInstruction::CreateSlice(target_shape, result, start_indices, - base_shape_.dimensions(), strides)); + target_shape.dimensions(), strides)); } return result; } +absl::optional +PartitionedHlo::ReshardToPartialReplicateWithAllGather( + const HloSharding& target) { + if (!target.ReplicateOnLastTileDim()) { + return absl::nullopt; + } + // Tiled/partial replicate to partial replicate + // Get the comptible sharding to target with resharding by all reduce. + auto compatible_sharding = + PartialReplicateReshardCompatibleSharding(target, sharding()); + if (!compatible_sharding.has_value()) { + return absl::nullopt; + } + + const auto& temp_sharding = compatible_sharding.value(); + auto partitioned_hlo = *this; + // Use collective permute to adjust device assignment if needed. + if (CanReshardWithCollectivePermute(sharding(), temp_sharding)) { + partitioned_hlo = + partitioned_hlo.ReshardWithCollectivePermute(temp_sharding); + } + + // Get replicate dims and replicate factor of each dimensions. 
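As a shape-level illustration of the step that follows, assuming plain per-dimension tile counts rather than `HloSharding`: a dimension needs all-gathering exactly when the source tiling is finer than the target tiling, and the ratio is the replication factor used to group devices. `ReplicationFactors` is an illustrative name.

#include <cstdint>
#include <utility>
#include <vector>

// Returns (dimension, factor) pairs for every dimension where the source
// tiling has to be collapsed by all-gather to reach the target tiling.
std::vector<std::pair<int64_t, int64_t>> ReplicationFactors(
    const std::vector<int64_t>& source_tiles,
    const std::vector<int64_t>& target_tiles) {
  std::vector<std::pair<int64_t, int64_t>> dims_and_factors;
  for (size_t dim = 0; dim < source_tiles.size(); ++dim) {
    int64_t factor = source_tiles[dim] / target_tiles[dim];
    if (factor > 1) {
      dims_and_factors.emplace_back(dim, factor);
    }
  }
  return dims_and_factors;
}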
+ int64 rank = hlo_->shape().rank(); + std::vector replicate_dims; + std::vector replicate_factors; + for (int64 dim = 0; dim < rank; dim++) { + int64 replicate_factor = temp_sharding.tile_assignment().dim(dim) / + target.tile_assignment().dim(dim); + if (replicate_factor > 1) { + replicate_dims.emplace_back(dim); + replicate_factors.emplace_back(replicate_factor); + } + } + + // Do left halo exchange if all-reduce directly will remove useful data + // from the source. + auto halo_exchange = TileToPartialReplicateHaloExchange( + partitioned_hlo.hlo_, base_shape_, temp_sharding, target, replicate_dims, + partitioned_hlo.state().collective_ops_creator, + partitioned_hlo.state().next_channel_id, + partitioned_hlo.state().partition_id, partitioned_hlo.state().b); + if (!halo_exchange.has_value()) { + return absl::nullopt; + } + auto halo_exchange_hlo = halo_exchange.value(); + // Grouped on replicate dimensions. + auto sharding_grouped = + GroupShardingOnDims(temp_sharding, replicate_dims, replicate_factors); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + partitioned_hlo.state(), sharding_grouped.device_groups, + partitioned_hlo.state().b); + auto base_shape = MakePartitionedShape(base_shape_, target); + // It's possible that halo_exchange_hlo == hlo.hlo(). + // Record the sharding of hlo here, and reset it before return. + auto original_sharding = partitioned_hlo.sharding(); + halo_exchange_hlo->set_sharding(sharding_grouped.sharding); + auto partial_replicate_hlo = PartitionedHlo(halo_exchange_hlo, base_shape, + per_group_partitioner_state); + HloInstruction* result = + partial_replicate_hlo.ReplicatePartial(replicate_dims); + partitioned_hlo.hlo()->set_sharding(original_sharding); + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, partitioned_hlo.state()); +} + +absl::optional +PartitionedHlo::ReshardFromPartialReplicateWithDynamicSlice( + const HloSharding& target) { + if (!sharding().ReplicateOnLastTileDim()) { + return absl::nullopt; + } + + // Get the temp sharding target from partial replicate to target tile dims. + // target_compatible_sharding has the same tile_assignment dimensions + // as the target and can reshard to target by collective permute. + // target_compatible_sharding could have different device assignment as + // targe. sharding() can reshard to target_compatible_sharding by + // dynamic slice. + auto target_compatible_sharding = + PartialReplicateReshardCompatibleSharding(sharding(), target); + // Reshard to target_compatible_sharding by dynamic slice. + if (!target_compatible_sharding.has_value()) { + return absl::nullopt; + } + std::vector expand_tile_dims; + std::vector tiling_dim_factors; + int64 rank = hlo_->shape().rank(); + tiling_dim_factors.reserve(target.tile_assignment().num_dimensions()); + const auto& temp_target_sharding = target_compatible_sharding.value(); + for (int64 dim = 0; dim < rank; dim++) { + if (temp_target_sharding.tile_assignment().dim(dim) > + sharding().tile_assignment().dim(dim)) { + expand_tile_dims.push_back(dim); + } + tiling_dim_factors.emplace_back( + temp_target_sharding.tile_assignment().dim(dim) / + sharding().tile_assignment().dim(dim)); + } + + // Add another dimension in tiling_dim_factors if target is partial replicate. + if (target.ReplicateOnLastTileDim()) { + tiling_dim_factors.emplace_back( + target.tile_assignment().dimensions().back()); + } + + // Get per_group partitioner state. 
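A conceptual, simplified version of how a linear partition id turns into dynamic-slice start offsets under a row-major (iota) tile assignment with evenly divisible shards; this is not the real `MakePartitionOffsets`, and `ShardStartOffsets` is a made-up name.

#include <cstdint>
#include <vector>

// Decomposes the partition id into per-dimension tile coordinates (last
// dimension varies fastest) and scales each coordinate by the shard size to
// get the slice start offset for that dimension.
std::vector<int64_t> ShardStartOffsets(int64_t partition_id,
                                       const std::vector<int64_t>& tiles_per_dim,
                                       const std::vector<int64_t>& shard_sizes) {
  std::vector<int64_t> offsets(tiles_per_dim.size());
  for (int64_t dim = static_cast<int64_t>(tiles_per_dim.size()) - 1; dim >= 0; --dim) {
    int64_t coordinate = partition_id % tiles_per_dim[dim];
    partition_id /= tiles_per_dim[dim];
    offsets[dim] = coordinate * shard_sizes[dim];
  }
  return offsets;
}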
+ std::vector group_dims(sharding().tile_assignment().num_dimensions() - + 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto sharding_grouped = GroupShardingOnDims(sharding(), group_dims); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + state_, sharding_grouped.device_groups, state_.b); + // 2. Get the padded_hlo, do right halo exchange if needed. + auto padded_hlo = PadFromPartialReplicateShape( + hlo_, base_shape_, sharding(), temp_target_sharding, expand_tile_dims, + state_.collective_ops_creator, state_.next_channel_id, + state_.partition_id, state_.b); + if (!padded_hlo.has_value()) { + return absl::nullopt; + } + // 3. Slice out the tile from replicate ones. + auto shard_shape = MakePartitionedShape(base_shape_, temp_target_sharding); + // device assignment within each group is sorted in + // HloSharding::PartialTile, thus partiton_id within each group can be + // matched with the order in tile_assignment. + Array tiling_assignment(tiling_dim_factors); + tiling_assignment.FillIota(0); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo.value(), + MakePartitionOffsets(padded_hlo.value()->shape(), + target.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tiling_assignment) + : HloSharding::Tile(tiling_assignment), + per_group_partitioner_state.partition_id, + per_group_partitioner_state.b), + shard_shape.dimensions())); + slice->set_sharding(temp_target_sharding); + auto result = PartitionedHlo(slice, base_shape_, state_); + // If temp_target_sharding's device assignment is different from target, + // use collective permute to reshard. + if (CanReshardWithCollectivePermute(temp_target_sharding, target)) { + return result.ReshardWithCollectivePermute(target); + } + // If device assignment in temp_target_sharding and target are the same, + // return result directly. + return result; +} + PartitionedHlo PartitionedHlo::Broadcast() const { const Shape& shape = hlo_->shape(); const HloSharding& sharding = hlo_->sharding(); @@ -813,8 +1127,9 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll( sharding().tile_assignment().dim(source_dim); temp_target_tile.Reshape(temp_target_tile_dims); } - auto temp_target = HloSharding::Tile(temp_target_tile); - + auto temp_target = target.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(temp_target_tile) + : HloSharding::Tile(temp_target_tile); auto padded_shape = hlo_->shape(); padded_shape.set_dimensions( target_dim, @@ -904,6 +1219,27 @@ PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( const HloSharding& target) const { CHECK(CanReshardWithCollectivePermute(sharding(), target)) << sharding().ToString() << " to " << target.ToString(); + if (auto broadcast_dims = state_.b->BroadcastDimsForCreatedHlo(hlo())) { + if (!(*broadcast_dims)->empty()) { + // If hlo() has broadcast dims, check if data is already the same between + // source/destination pairs. 
+ std::vector broadcast_dims_vector; + for (int64 i = 0; i < hlo()->shape().rank(); ++i) { + if ((*broadcast_dims)->contains(i)) { + broadcast_dims_vector.push_back(i); + } + } + if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + sharding(), broadcast_dims_vector) == + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + target, broadcast_dims_vector)) { + auto copy = state_.b->AddInstruction(HloInstruction::CreateUnary( + hlo()->shape(), HloOpcode::kCopy, hlo())); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + } + } std::vector> src_dst_pairs; sharding().tile_assignment().Each( [&](absl::Span indices, int64 src_device) { @@ -1075,7 +1411,7 @@ namespace { // gather/scatter slice size 1. bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( const PartitionedHlo& operand, absl::Span index_map, - absl::Span slice_size, int64 num_partitions) { + absl::Span slice_size) { if (operand.sharding().IsTileMaximal()) { return false; } @@ -1086,7 +1422,7 @@ bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand.sharding().tile_assignment().dim(dim); } } - return trivial_slice_dims_partitions == num_partitions; + return trivial_slice_dims_partitions == operand.sharding().NumTiles(); } // Returns the min and max for the indices (replicated) in a scatter/gather @@ -1209,6 +1545,16 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { case HloOpcode::kAnd: identity = CreateOne(operand.hlo()->shape(), &b_); break; + case HloOpcode::kMinimum: + identity = CreateConstant( + operand.hlo()->shape(), + LiteralUtil::MaxValue(hlo->shape().element_type()), &b_); + break; + case HloOpcode::kMaximum: + identity = CreateConstant( + operand.hlo()->shape(), + LiteralUtil::MinValue(hlo->shape().element_type()), &b_); + break; default: return DefaultAction(hlo); } @@ -1221,14 +1567,29 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { update_dim_to_index_dim[update_scatter_dims[i]] = indices_scatter_dim; index_dim_to_update_dim[indices_scatter_dim] = update_scatter_dims[i]; } - auto new_updates_sharding = TransposeShardingWithCollapsedDims( - indices.sharding(), index_dim_to_update_dim, update_dim_to_index_dim); + auto new_updates_sharding = + hlo_sharding_util::TransposeShardingWithCollapsedDims( + indices.sharding(), index_dim_to_update_dim, + update_dim_to_index_dim); CHECK(new_updates_sharding.has_value()); updates = updates.Reshard(*new_updates_sharding); + // Update collective_ops_creator and partition_id for partial replicate. + auto collective_ops_creator = collective_ops_creator_; + auto partition_id = partition_id_; + if (indices.sharding().ReplicateOnLastTileDim()) { + auto sharding_grouped = GroupShardingOnDims( + indices.sharding(), + {indices.sharding().tile_assignment().num_dimensions() - 1}); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + indices.state(), sharding_grouped.device_groups, &b_); + collective_ops_creator = + per_group_partitioner_state.collective_ops_creator; + partition_id = per_group_partitioner_state.partition_id; + } // To avoid accumulating the initial operand multiple times during - // all-reduce, we use zero operands for all non-zero partitions. + // all-reduce, we use identity operands for all non-zero partitions. 
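The scatter path now also handles min/max combiners by seeding non-owning partitions with the combiner's identity. A minimal sketch with an illustrative `Combiner` enum; for floating-point min/max the exact identities would be +/-infinity, so `lowest()`/`max()` are only the finite stand-ins here.

#include <limits>

enum class Combiner { kAdd, kMultiply, kMinimum, kMaximum };

// Identity element per combiner: contributing it from the non-owning
// partitions leaves the result unchanged after the cross-partition combine.
template <typename T>
T IdentityFor(Combiner op) {
  switch (op) {
    case Combiner::kAdd:      return T(0);
    case Combiner::kMultiply: return T(1);
    case Combiner::kMinimum:  return std::numeric_limits<T>::max();
    case Combiner::kMaximum:  return std::numeric_limits<T>::lowest();
  }
  return T(0);
}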
auto not_partition_zero = b_.AddInstruction(HloInstruction::CreateConvert( - ShapeUtil::MakeScalarShape(PRED), partition_id_)); + ShapeUtil::MakeScalarShape(PRED), partition_id)); not_partition_zero = b_.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::ChangeElementType(identity->shape(), PRED), not_partition_zero, {})); @@ -1239,7 +1600,7 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { auto pscatter = b_.AddInstruction(scatter->CloneWithNewOperands( scatter->shape(), {select_operand, indices.hlo(), updates.hlo()})); auto all_reduce = - collective_ops_creator_.create_cross_partition_all_reduce( + collective_ops_creator.create_cross_partition_all_reduce( &b_, pscatter, scatter->to_apply(), {}, NewChannel()); all_reduce->set_sharding(HloSharding::Replicate()); SetPartitionedHlo(hlo, [&]() { @@ -1269,8 +1630,7 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { return Status::OK(); } if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( - operand, scatter_dims_to_operand_dims, slice_size, - num_partitions_) && + operand, scatter_dims_to_operand_dims, slice_size) && ShapeSizeInBytes(updates.base_shape()) < ShapeSizeInBytes(scatter->shape())) { // Operand is sharded on trivial slice dims (update slice size 1). We can @@ -1712,6 +2072,16 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { return Status::OK(); } + // Check if operand sharding and sharding are both tiled or partial replicate. + // If both of them are partial replicate, check num_replications are the same. + if (operand.sharding().ReplicateOnLastTileDim() != + sharding.ReplicateOnLastTileDim() || + (sharding.ReplicateOnLastTileDim() && + (operand.sharding().tile_assignment().dimensions().back() != + sharding.tile_assignment().dimensions().back()))) { + return DefaultAction(hlo); + } + // Try use halo exchange for certain split-dim/merge-dims cases. // ReshapeSharding failed in these cases probably due to uneven partitioning, // where halo exchange could help. Specifically we check the following @@ -1747,7 +2117,14 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { Array new_input_tile_assignment = sharding.tile_assignment(); new_input_tile_assignment.Reshape( operand.sharding().tile_assignment().dimensions()); - operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + auto aligned_sharding = + sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_input_tile_assignment) + : HloSharding::Tile(new_input_tile_assignment); + operand = operand.Reshard(aligned_sharding); + auto replication_count = sharding.ReplicateOnLastTileDim() + ? 
sharding.tile_assignment().dimensions().back() + : 1; int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); @@ -1770,7 +2147,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { dim->set_padding_low(0); if (i == input_sharded_dim) { dim->set_padding_high(output_shard_size * split_factor * - num_partitions_ - + num_partitions_ / replication_count - input_dim_size); } else { dim->set_padding_high(0); @@ -1808,8 +2185,8 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { tmp_reshape->set_sharding(hlo->sharding()); auto tmp_full_shape = tmp_shard_shape; tmp_full_shape.set_dimensions( - output_sharded_dim, - tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + output_sharded_dim, tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_ / replication_count); auto tmp_output = PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); @@ -1826,7 +2203,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { if (i == output_sharded_dim) { dim->set_padding_high(output_dim_size - tmp_shard_shape.dimensions(output_sharded_dim) * - num_partitions_); + num_partitions_ / replication_count); } else { dim->set_padding_high(0); } @@ -1951,67 +2328,22 @@ Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { auto& operand = GetPartitionedHlo(hlo->operand(0)); // Tiled output. - std::vector wanted_input_tile_size(operand.base_shape().rank()); - std::vector sharded_new_dims; - for (int64 i = 0; i < operand.base_shape().rank(); ++i) { - wanted_input_tile_size[i] = - hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); - } + std::vector new_dims; for (int64 i = 0; i < hlo->shape().rank(); ++i) { - if (!absl::c_linear_search(hlo->dimensions(), i) && - hlo->sharding().tile_assignment().dim(i) > 1) { - sharded_new_dims.push_back(i); + if (!absl::c_linear_search(hlo->dimensions(), i)) { + new_dims.push_back(i); } } - if (sharded_new_dims.empty()) { - // The new dimensions are replicated, so that we can do the adjustment on - // the input. - Array wanted_input_tile_assignment(wanted_input_tile_size); - wanted_input_tile_assignment.Each( - [&](absl::Span indices, int64* val) { - std::vector indices_in_broadcast(hlo->shape().rank(), 0); - for (int64 i = 0; i < operand.base_shape().rank(); ++i) { - indices_in_broadcast[hlo->dimensions(i)] = indices[i]; - } - *val = hlo->sharding().tile_assignment()(indices_in_broadcast); - }); - SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction(hlo->CloneWithNewOperands( - MakePartitionedShape(hlo->shape(), hlo->sharding()), - {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) - .hlo()})); - }); - } else { - auto input = operand.Reshard(HloSharding::Replicate()).hlo(); - // We pad and shard the input first, then broadcast to the final shard - // shape. 
- auto output_offsets = - MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); - std::vector input_offsets(operand.base_shape().rank()); - auto output_shard_shape = - MakePartitionedShape(hlo->shape(), hlo->sharding()); - auto input_shard_shape = input->shape(); - auto padded_input_shape = input->shape(); - for (int64 i = 0; i < input_offsets.size(); ++i) { - input_offsets[i] = output_offsets[hlo->dimensions(i)]; - input_shard_shape.set_dimensions( - i, output_shard_shape.dimensions(hlo->dimensions(i))); - padded_input_shape.set_dimensions( - i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * - input_shard_shape.dimensions(i)); - } - auto padded_input = PadToShape(input, padded_input_shape, &b_); - auto input_shard = - ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) - ? padded_input - : b_.AddInstruction(HloInstruction::CreateDynamicSlice( - input_shard_shape, padded_input, input_offsets, - input_shard_shape.dimensions())); - SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction( - hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); - }); - } + auto desired_input_sharding = hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(hlo->sharding(), + new_dims), + new_dims); + auto input = operand.Reshard(desired_input_sharding).hlo(); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input})); + }); return Status::OK(); } @@ -2134,8 +2466,10 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { output_dim_to_index_dim[batch_dims[i]] = indices_batch_dim; index_dim_to_output_dim[indices_batch_dim] = batch_dims[i]; } - auto pgather_sharding = TransposeShardingWithCollapsedDims( - indices.sharding(), index_dim_to_output_dim, output_dim_to_index_dim); + auto pgather_sharding = + hlo_sharding_util::TransposeShardingWithCollapsedDims( + indices.sharding(), index_dim_to_output_dim, + output_dim_to_index_dim); CHECK(pgather_sharding.has_value()); pgather->set_sharding(*pgather_sharding); SetPartitionedHlo(hlo, [&]() { @@ -2171,8 +2505,7 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { return Status::OK(); } if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( - operand, start_index_map, gather->gather_slice_sizes(), - num_partitions_) && + operand, start_index_map, gather->gather_slice_sizes()) && ShapeSizeInBytes(gather->shape()) < ShapeSizeInBytes(gather->operand(0)->shape())) { indices = indices.Reshard(HloSharding::Replicate()); @@ -2234,7 +2567,17 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { pgather->shape(), HloOpcode::kSelect, broadcast_filter, CreateZero(pgather->shape(), &b_), pgather)); // Combine from different partitions. 
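A shape-level sketch of the new broadcast lowering, under the assumption that shardings are represented as plain per-dimension tile counts: the operand inherits the output's tiling on the dimensions it maps to, while partitions along the broadcast's new dimensions fold into a trailing partial-replication factor (roughly what `PartiallyReplicateTiledShardingOnDims` followed by `RemoveShapeDimensions` produces). `OperandTiling` and `DesiredOperandTiling` are illustrative names.

#include <cstdint>
#include <vector>

struct OperandTiling {
  std::vector<int64_t> tile_dims;  // One entry per operand dimension.
  int64_t replication = 1;         // Partial replication on the last tile dim.
};

OperandTiling DesiredOperandTiling(
    const std::vector<int64_t>& output_tile_dims,
    const std::vector<int64_t>& operand_to_output_dim) {
  OperandTiling result;
  std::vector<bool> mapped(output_tile_dims.size(), false);
  for (int64_t out_dim : operand_to_output_dim) {
    result.tile_dims.push_back(output_tile_dims[out_dim]);
    mapped[out_dim] = true;
  }
  for (size_t i = 0; i < output_tile_dims.size(); ++i) {
    if (!mapped[i]) result.replication *= output_tile_dims[i];  // New dims.
  }
  return result;
}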
- auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + auto collective_ops_creator = collective_ops_creator_; + if (operand.sharding().ReplicateOnLastTileDim()) { + auto sharding_grouped = GroupShardingOnDims( + operand.sharding(), + {operand.sharding().tile_assignment().num_dimensions() - 1}); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + operand.state(), sharding_grouped.device_groups, &b_); + collective_ops_creator = + per_group_partitioner_state.collective_ops_creator; + } + auto ar = collective_ops_creator.create_cross_partition_all_reduce( &b_, filtered, MakeBinaryAdd(filtered->shape().element_type(), module_), {}, NewChannel()); @@ -2492,7 +2835,13 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { .Reshard(HloSharding::Replicate()) .hlo()); inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); - if (operand_id > 0) { + if (hlo->shape().IsTuple() && operand_id == 0) { + // We cannot do tuple-reduce where partitioned dimensions are reduced. + // Partially replicate on those dims. + inputs[0] = inputs[0].Reshard( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + inputs[0].sharding(), hlo->dimensions())); + } else { // Make sure all operands are sharded in the same way. inputs.back() = inputs.back().Reshard(inputs[0].sharding()); } @@ -2500,28 +2849,6 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { inputs.back() = inputs.back().PadWithValue(inits[operand_id]); } } - bool reduce_sharded_dimension = false; - if (!inputs[0].sharding().IsTileMaximal()) { - reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { - return inputs[0].sharding().tile_assignment().dim(i) > 1; - }); - - // reduce_sharded_dimension is not supported for tuple-shaped reduces. - if (reduce_sharded_dimension && input_count > 1) { - return DefaultAction(hlo); - } - - // Currently we only support reducing all or none of the sharded - // dimensions. - if (reduce_sharded_dimension) { - for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { - if (inputs[0].sharding().tile_assignment().dim(i) > 1 && - absl::c_count(hlo->dimensions(), i) == 0) { - return DefaultAction(hlo); - } - } - } - } std::vector new_operand_shapes(input_count * 2); for (int64 i = 0; i < input_count; ++i) { @@ -2533,7 +2860,6 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { auto reduce_shape, ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), hlo->to_apply()->ComputeProgramShape())); - *reduce_shape.mutable_layout() = hlo->shape().layout(); std::vector input_hlos(input_count); for (int64 i = 0; i < input_count; ++i) { @@ -2544,36 +2870,35 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { local_reduce->set_metadata(hlo->metadata()); SetPartitionedHlo(hlo, [&]() { - HloInstruction* reduce; + HloInstruction* reduce = local_reduce; + const bool reduce_sharded_dimension = + !inputs[0].sharding().IsTileMaximal() && + absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); if (reduce_sharded_dimension) { CHECK(local_reduce->shape().IsArray()); - reduce = collective_ops_creator_.create_cross_partition_all_reduce( - &b_, local_reduce, hlo->to_apply(), {}, NewChannel()); - reduce->set_sharding(HloSharding::Replicate()); - } else { - reduce = local_reduce; - if (inputs[0].sharding().IsTileMaximal()) { - reduce->set_sharding(inputs[0].sharding()); - } else { - // Remove tile assignment dimensions that are reduced. 
- std::vector tile_dimensions; - for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { - if (absl::c_count(hlo->dimensions(), i) == 0) { - tile_dimensions.push_back( - inputs[0].sharding().tile_assignment().dim(i)); - } + std::vector preserved_dims; + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i)) { + preserved_dims.push_back(i); } - Array new_tile = inputs[0].sharding().tile_assignment(); - new_tile.Reshape(tile_dimensions); - auto sharding = HloSharding::Tile(new_tile); - if (input_count > 1) { - std::vector tuple(input_count, sharding); - sharding = HloSharding::Tuple(hlo->shape(), tuple); - } - reduce->set_sharding(sharding); } + if (inputs[0].sharding().ReplicateOnLastTileDim()) { + preserved_dims.push_back(inputs[0].base_shape().rank()); + } + auto grouped = GroupShardingOnDims(inputs[0].sharding(), preserved_dims); + auto grouped_state = CreatePerGroupPartitioningState( + inputs[0].state(), grouped.device_groups, &b_); + reduce = grouped_state.collective_ops_creator + .create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), {}, NewChannel()); } - + auto sharding = hlo_sharding_util::RemoveShapeDimensions( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + inputs[0].sharding(), hlo->dimensions()), + hlo->dimensions()); + reduce->set_sharding(sharding); return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) .Reshard(hlo->sharding()) .hlo(); @@ -2692,18 +3017,37 @@ Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { } TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); - SetPartitionedHlo(hlo, [&] { - // Replicate the operands and run partitioned Rng on all devices. - std::vector new_operands; - for (int64 i = 0; i < hlo->operand_count(); ++i) { - new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) - .Reshard(HloSharding::Replicate()) - .hlo()); - } - return b_.AddInstruction(HloInstruction::CreateRng( + // Replicate the operands and run partitioned Rng on all devices. 
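// The all-reduce above now runs only inside device groups: the input sharding
// is grouped on the preserved (non-reduced) tile dimensions, plus the
// trailing replication dimension when the input uses last_tile_dim_replicate,
// and CreatePerGroupPartitioningState supplies a collective-ops creator
// scoped to each group. The sketch below is a standalone illustration of
// which devices land in the same all-reduce group (it is not the XLA
// GroupShardingOnDims helper): devices that agree on every preserved
// dimension, i.e. that differ only along the reduced dimensions, are grouped
// together.
#include <cstdint>
#include <map>
#include <vector>

std::vector<std::vector<int64_t>> AllReduceGroups(
    const std::vector<int64_t>& dims,     // tile-assignment dimensions
    const std::vector<int64_t>& devices,  // row-major device order
    const std::vector<bool>& is_reduced_dim) {
  std::map<std::vector<int64_t>, std::vector<int64_t>> groups;
  for (int64_t linear = 0; linear < static_cast<int64_t>(devices.size());
       ++linear) {
    // Decode the row-major coordinates of this tile-assignment entry.
    std::vector<int64_t> coords(dims.size());
    int64_t rest = linear;
    for (int64_t d = static_cast<int64_t>(dims.size()) - 1; d >= 0; --d) {
      coords[d] = rest % dims[d];
      rest /= dims[d];
    }
    // Group key: coordinates of the preserved dimensions only.
    std::vector<int64_t> key;
    for (size_t d = 0; d < dims.size(); ++d) {
      if (!is_reduced_dim[d]) key.push_back(coords[d]);
    }
    groups[key].push_back(devices[linear]);
  }
  std::vector<std::vector<int64_t>> result;
  for (auto& kv : groups) result.push_back(kv.second);
  return result;
}

// Example, matching the PartialTiledToPartialTiledReduce test added below:
// dims={2,2,2} (last dim is the replication dim), devices={0,...,7}, and
// reduce dimension 0 -> groups {0,4}, {1,5}, {2,6}, {3,7}, so each f32[2]
// partial result is summed across a pair of devices only.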
+ std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + + if (!hlo->sharding().ReplicateOnLastTileDim()) { + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + } else { + std::vector group_dims( + hlo->sharding().tile_assignment().num_dimensions() - 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto sharding_grouped = GroupShardingOnDims(hlo->sharding(), group_dims); + auto per_group_state = CreatePerGroupPartitioningState( + MakePartitioningState(), sharding_grouped.device_groups, &b_); + auto rng = b_.AddInstruction(HloInstruction::CreateRng( MakePartitionedShape(hlo->shape(), hlo->sharding()), hlo->random_distribution(), new_operands)); - }); + rng->set_sharding(HloSharding::AssignDevice(0)); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(rng, rng->shape(), per_group_state) + .Replicate() + .hlo(); + }); + } return Status::OK(); } @@ -3258,7 +3602,7 @@ StatusOr SpmdPartitioner::Run(HloModule* module) { HloPassPipeline pass("spmd-cleanup"); pass.AddPass(); pass.AddPass(); - pass.AddPass(/*is_layout_sensitive=*/true); + pass.AddPass(/*is_layout_sensitive=*/false); pass.AddPass(); TF_RETURN_IF_ERROR(pass.Run(module).status()); } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index a612c16bdae..b09ea0c8e0b 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -47,6 +48,12 @@ struct SpmdPartitionerOptions { // Whether the entry computations' signature could change after partitioning. bool allow_module_signature_change = false; + + // Whether to use cached all-gather to avoid repeatedly replicate a tiled + // tensor. If it is set to false, the result tends to be more + // memory-efficient, and the compiler can use the ScheduleAwareAllGatherCSE + // pass to CSE some all-gathers which are relatively close to each other. + bool cache_all_gather = true; }; // Class to wrap the computation builder to capture information during SPMD @@ -68,6 +75,16 @@ class SpmdBuilder : public HloComputation::Builder { HloInstruction* visiting_hlo() const { return visiting_hlo_; } + // Wrapper of queries to broadcast_dims_. + absl::optional*> BroadcastDimsForCreatedHlo( + const HloInstruction* hlo) { + auto it = broadcast_dims_.find(hlo); + if (it == broadcast_dims_.end()) { + return absl::nullopt; + } + return &it->second; + } + private: // Currently visiting instruction. HloInstruction* visiting_hlo_; @@ -75,6 +92,12 @@ class SpmdBuilder : public HloComputation::Builder { // Map from the currently visiting (old) instruction to new instructions // created during SPMD partitioning. HloInstructionMap> instructions_; + + // Maps from each created instruction to a set of dimensions that are from + // broadcasts or elementwise ops over broadcasts. This means elements along + // these dimensions have the same value. 
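// Assumed illustration of the broadcast_dims_ invariant documented above (the
// rule below is an assumption about how the property propagates, not the
// builder's actual bookkeeping): a dimension keeps the "same value everywhere
// along this dim" property through an elementwise op only if every operand
// already has it on that dimension, so the tracked set is the intersection of
// the operands' sets.
#include <cstdint>
#include <set>
#include <vector>

std::set<int64_t> ElementwiseBroadcastDims(
    const std::vector<std::set<int64_t>>& operand_broadcast_dims) {
  if (operand_broadcast_dims.empty()) return {};
  std::set<int64_t> result = operand_broadcast_dims[0];
  for (size_t i = 1; i < operand_broadcast_dims.size(); ++i) {
    std::set<int64_t> intersection;
    for (int64_t dim : result) {
      if (operand_broadcast_dims[i].count(dim) > 0) intersection.insert(dim);
    }
    result = intersection;
  }
  return result;
}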
+ absl::flat_hash_map> + broadcast_dims_; }; // A set of functions that create the cross-partition collective ops. @@ -180,6 +203,8 @@ class SpmdPartitioner : public HloModulePass { int64 channel_id, absl::Span selected_dims, const SPMDCollectiveOpsCreator& collectives_creator); + const SpmdPartitionerOptions& options() { return options_; } + protected: virtual std::unique_ptr CreateVisitor( HloComputation* computation, int64 num_partitions, int64 num_replicas, @@ -305,6 +330,14 @@ class PartitionedHlo { // Helper function to reshard the tensor using CollectivePermute. PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + // Helper function to reshard to partial replicate using AllGather. + absl::optional ReshardToPartialReplicateWithAllGather( + const HloSharding& target); + + // Helper function to reshard from partial replicate using DynamicSlice. + absl::optional ReshardFromPartialReplicateWithDynamicSlice( + const HloSharding& target); + // SPMD instruction. HloInstruction* hlo_; @@ -314,27 +347,11 @@ class PartitionedHlo { PartitioningState state_; }; -struct DotGeneralDimsMapping { +struct DotConvDimsMapping { // The dimension numbers for the operands and output corresponding to a // logical dimension (e.g., batch, contracting, non-contracting). If an // operand or the output doesn't have the logical dimension, it is set to // -1. - struct DimsMapping { - int64 lhs; - int64 rhs; - int64 output; - }; - std::vector batch_dims; - std::vector contracting_dims; - std::vector lhs_non_contracting_dims; - std::vector rhs_non_contracting_dims; -}; - -struct ConvolutionDimsMapping { - // The dimension numbers for the operands and output corresponding to a - // logical dimension (e.g., batch, parallel, non-parallel). If an - // operand or the output doesn't have the logical dimension, it is set to - // -1. struct DimsMapping { int64 lhs; int64 rhs; @@ -342,8 +359,11 @@ struct ConvolutionDimsMapping { // input mapped to index in input_spatial_dimensions(). int64 spatial; }; - std::vector parallel_spatial_dims; - std::vector non_parallel_spatial_dims; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; + std::vector conv_spatial_dims; }; class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { @@ -388,7 +408,7 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { // Implementation of dot partitioning given DotGeneralDimsMapping. 
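// Worked example for the merged DotConvDimsMapping above, which now serves
// both dot and convolution partitioning (conv_spatial_dims is simply empty
// for a plain dot). The mirror struct below uses int64_t in place of int64 so
// the snippet is self-contained; spatial is assumed to be -1 for non-spatial
// (pure dot) dimensions.
#include <cstdint>
#include <vector>

struct DimsMappingSketch {
  int64_t lhs;
  int64_t rhs;
  int64_t output;
  int64_t spatial;  // index into input_spatial_dimensions(); -1 if unused.
};

struct DotConvDimsMappingSketch {
  std::vector<DimsMappingSketch> batch_dims;
  std::vector<DimsMappingSketch> contracting_dims;
  std::vector<DimsMappingSketch> lhs_non_contracting_dims;
  std::vector<DimsMappingSketch> rhs_non_contracting_dims;
  std::vector<DimsMappingSketch> conv_spatial_dims;
};

// f32[2,24,100] dot f32[2,32,100] -> f32[2,24,32] with batch dims {0} and
// contracting dims {2} on both sides (the SimpleDotPartial test added below):
const DotConvDimsMappingSketch kSimpleDotPartialMapping = {
    /*batch_dims=*/{{0, 0, 0, -1}},
    /*contracting_dims=*/{{2, 2, -1, -1}},
    /*lhs_non_contracting_dims=*/{{1, -1, 1, -1}},
    /*rhs_non_contracting_dims=*/{{-1, 1, 2, -1}},
    /*conv_spatial_dims=*/{}};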
Status HandleDotHelper( - HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + HloInstruction* hlo, const DotConvDimsMapping& dims_mapping, const std::function( HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index d5342e3e1f4..f3bd971df69 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -138,8 +138,7 @@ ENTRY entry { op::AllReduce(op::Select( op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), op::Constant(), op::Broadcast())), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant())), op::Shape("s32[1,3]"))); } @@ -161,8 +160,7 @@ ENTRY entry { op::Copy(op::AllReduce(AllOf( op::DynamicUpdateSlice( op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant()), op::Shape("s32[2,3]"))))); } @@ -184,8 +182,7 @@ ENTRY entry { op::Copy(op::Copy(op::AllReduce(AllOf( op::DynamicUpdateSlice( op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant()), op::Shape("s32[2,3]")))))); } @@ -279,8 +276,8 @@ ENTRY entry { HloInstruction* root = module->entry_computation()->root_instruction(); ASSERT_THAT(root, op::Tuple()); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); EXPECT_THAT(root->operand(0), op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, @@ -305,13 +302,13 @@ ENTRY entry { PartitionComputation(hlo_string, /*num_devices=*/2)); HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT( - root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( - op::Broadcast(), - op::GetTupleElement( - AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), - op::Constant())))); + root, + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), + op::Constant())))); } TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { @@ -564,6 +561,27 @@ ENTRY entry { op::Constant()))))); } +TEST_F(SpmdPartitioningTest, + BroadcastBothOldAndNewDimsShardedPartiallySharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4,3] parameter(0), + sharding={devices=[1,2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} + ROOT broadcast = f32[4,4,3] broadcast(param), dimensions={1,2}, + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,4,2]"), + op::Broadcast(AllOf(op::Shape("f32[4,2]"), 
op::Parameter(0))))); +} + TEST_F(SpmdPartitioningTest, ConvWithParallelDimAndNonParallelSpatialDimPartitioned) { const char* const hlo_string = R"( @@ -1985,6 +2003,36 @@ ENTRY entry { EXPECT_THAT(root, op::DynamicSlice(pad, _)); } +TEST_F(SpmdPartitioningTest, PartialReplicatePad) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[11,7] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %param1 = f32[] parameter(1), sharding={replicated} + ROOT %pad = f32[27,22] pad(%param0, %param1), padding=2_4_1x2_1_2, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto param0 = AllOf(op::Parameter(), op::Shape("f32[11,4]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[11,4]"), + op::DynamicSlice( + AllOf(op::Shape("f32[11,5]"), + op::Concatenate(op::CollectivePermute(op::Slice(param0)), + param0)), + op::Constant(), _)); + auto pad = op::Pad(after_halo_exchange, op::Parameter(1)); + EXPECT_THAT(root, AllOf(op::DynamicSlice(pad, op::Constant(), _), + op::Shape("f32[27,11]"))); +} + TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -2042,6 +2090,61 @@ ENTRY entry { op::Shape("f32[63,14,126]"))); } +TEST_F(SpmdPartitioningTest, + PartialReplicateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %slice = f32[128,11,257] slice(%param0), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateSliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %slice = f32[63,14,251] slice(%param0), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf( + op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), + op::Add(op::Multiply(op::Reshape(op::DynamicSlice( + op::Constant(), op::PartitionId())), + op::Constant()), + op::Constant())), + op::Shape("f32[128,14,126]"))), + op::Shape("f32[63,14,126]"))); +} + TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -2577,6 +2680,79 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Transpose(), 
op::Shape("f32[16,2,38,38]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, + sharding={devices=[1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,19,38,4]")); + EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[16,4,19,38]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateNonShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto resahrd = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))), + op::Shape("f32[16,38,38,2]")); + EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateMultiDimensionShardedTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[2,2,1,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy), + dimensions={1,3,0,2}, + sharding={devices=[2,1,2,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[8,19,38,4]")); + EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,8,38]"))); +} + TEST_F(SpmdPartitioningTest, ShardableReshape) { const char* const hlo_string = R"( HloModule module @@ -2600,6 +2776,30 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateShardableReshape) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[38,38,324] parameter(0) + %param0.copy = f32[38,38,324] copy(%param0), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy), + sharding={devices=[2,1,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + 
AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[19,38,324]")); + EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]"))); +} + TEST_F(SpmdPartitioningTest, NonShardableReshape) { const char* const hlo_string = R"( HloModule module @@ -2652,6 +2852,30 @@ ENTRY entry { EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateReshapeMergeDimsWithHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[2,3,7,10] parameter(0), + sharding={devices=[1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + // Produces an invalid module after transformation. TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { const char* const hlo_string = R"( @@ -2746,6 +2970,35 @@ ENTRY entry { AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); } +TEST_F(SpmdPartitioningTest, PartialTiledToPartialTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,4] parameter(0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce = f32[4] reduce(%param0, %constant.1), dimensions={0}, + to_apply=%sum, + sharding={devices=[2,4]0,1,4,5,2,3,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Reduce(op::Parameter(0), op::Constant())), + op::Shape("f32[2]"))); +} + TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { const char* const hlo_string = R"( HloModule module @@ -2781,6 +3034,48 @@ ENTRY %main { op::Shape("(f32[14], s32[14])"))); } +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce2) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,2]0,1,2,3} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,2]0,1,2,3} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2,2]0,1,2,3 last_tile_dim_replicate}, + {devices=[2,2]0,1,2,3 
last_tile_dim_replicate}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = + AllOf(op::Shape("f32[14,10]"), + op::AllReduce(op::DynamicUpdateSlice(_, op::Parameter(0), _, _))); + auto rhs = + AllOf(op::Shape("s32[14,10]"), + op::AllReduce(op::DynamicUpdateSlice(_, op::Parameter(1), _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Reduce(lhs, rhs, op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { const char* const hlo_string = R"( HloModule module @@ -3633,6 +3928,35 @@ ENTRY entry { op::Shape("s32[2]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0), sharding={replicated} + %rhs = s32[] parameter(1), sharding={replicated} + ROOT %rng = s32[8]{0} rng(%lhs, %rhs), + distribution=rng_uniform, + sharding={devices=[2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Parameter(0), op::Shape("s32[]")); + auto rhs = AllOf(op::Parameter(1), op::Shape("s32[]")); + auto partition_id = + AllOf(op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), + op::Shape("u32[]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(partition_id, op::Constant())), + op::Rng(lhs, rhs), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -3710,6 +4034,26 @@ ENTRY entry { op::Shape("f32[3,5]"))); } +TEST_F(SpmdPartitioningTest, PassthroughGather_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + TEST_F(SpmdPartitioningTest, IndexPassthroughGather) { const char* const hlo_string = R"( HloModule module @@ -3729,6 +4073,27 @@ ENTRY entry { op::Shape("f32[8,2,2]"))); } +TEST_F(SpmdPartitioningTest, IndexPassthroughGather_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %gather = f32[8,4,4] gather(%input, %indices), offset_dims={0}, + collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=1, + slice_sizes={1,1,8}, + sharding={devices=[1,2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[8,2,2]"))); +} + TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { const char* const hlo_string = R"( HloModule module @@ -3743,8 +4108,39 @@ ENTRY entry { TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, /*num_devices=*/2)); VLOG(1) << module->ToString(); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + +TEST_F(SpmdPartitioningTest, + GatherPartitionedOnTrivialSliceDims_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), op::Shape("s32[2,3]")); @@ -3788,6 +4184,39 @@ ENTRY entry { op::Shape("f32[2,5]"))); } +TEST_F(SpmdPartitioningTest, PassthroughScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + TEST_F(SpmdPartitioningTest, IndexPassthroughScatter) { const char* const hlo_string = R"( HloModule module @@ -3822,6 +4251,76 @@ ENTRY entry { 
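// Standalone emulation of the lowering checked by the
// GatherPartitionedOnTrivialSliceDims tests above (assumed shard layout:
// partition p owns rows [p * shard_rows, (p + 1) * shard_rows) of the
// operand). Each partition clamps every index into its local row range,
// gathers from its shard, zeroes rows whose original index was out of range,
// and the per-partition results are combined by an all-reduce, emulated here
// with "+=" across a loop over partitions; this mirrors the
// Clamp/Subtract/Gather/Select/AllReduce pattern in the expectations, not the
// partitioner's actual code.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> PartitionedGatherRows(
    const std::vector<std::vector<float>>& full_operand,  // [rows][cols]
    const std::vector<int64_t>& indices, int64_t num_partitions) {
  const int64_t rows = full_operand.size();
  const int64_t cols = full_operand.empty() ? 0 : full_operand[0].size();
  const int64_t shard_rows = (rows + num_partitions - 1) / num_partitions;
  std::vector<std::vector<float>> result(indices.size(),
                                         std::vector<float>(cols, 0.0f));
  for (int64_t p = 0; p < num_partitions; ++p) {
    const int64_t offset = p * shard_rows;  // the broadcast "min" in the tests
    for (size_t i = 0; i < indices.size(); ++i) {
      const int64_t clamped =
          std::min(std::max(indices[i], offset), offset + shard_rows - 1);
      const bool out_of_range =
          indices[i] < offset || indices[i] > offset + shard_rows - 1;
      if (out_of_range || clamped >= rows) continue;  // contributes zeros
      for (int64_t c = 0; c < cols; ++c) {
        // Exactly one partition contributes a non-zero row per in-bounds
        // index, so the sum reproduces the unpartitioned gather.
        result[i][c] += full_operand[clamped][c];
      }
    }
  }
  return result;
}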
op::Shape("f32[2,9,8]"))); } +TEST_F(SpmdPartitioningTest, IndexPassthroughScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %updates = f32[4,4,8] parameter(2), + sharding={devices=[2,2,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %scatter = f32[2,9,8] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Scatter( + op::Select(op::Broadcast(op::Convert(op::Reshape())), + op::Broadcast(op::Constant()), op::Parameter(0)), + op::Parameter(1), op::Parameter(2))), + op::Shape("f32[2,9,8]"))); +} + +TEST_F(SpmdPartitioningTest, IndexPassthroughScatter_Min) { + const char* const hlo_string = R"( +HloModule module + +min (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT min = f32[] minimum(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), sharding={devices=[2,1,2]0,1,2,3} + %updates = f32[4,4,8] parameter(2), sharding={devices=[2,2,1]0,1,2,3} + ROOT %scatter = f32[2,9,8] scatter(%input, %indices, %updates), + to_apply=min, + update_window_dims={2}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Scatter( + op::Select(op::Broadcast(op::Convert(op::PartitionId())), + op::Broadcast(op::Constant()), op::Parameter(0)), + op::Parameter(1), op::Parameter(2))), + op::Shape("f32[2,9,8]"))); +} + TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { const char* const hlo_string = R"( HloModule module @@ -3846,8 +4345,45 @@ ENTRY entry { TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, /*num_devices=*/2)); VLOG(1) << module->ToString(); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + +TEST_F(SpmdPartitioningTest, + ScatterPartitionedOnTrivialSliceDims_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + 
%indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); auto indices = op::Subtract( op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); HloInstruction* root = module->entry_computation()->root_instruction(); @@ -4035,7 +4571,7 @@ HloModule module ENTRY entry { %lhs = f32[48,12] parameter(0), sharding={devices=[2,2]0,1,2,3} - %rhs = f32[32,12] parameter(1), sharding={devices=[2,2]0,1,2,3} + %rhs = f32[32,12] parameter(1), sharding={devices=[2,2]0,2,1,3} ROOT %dot = f32[48,32] dot(%lhs, %rhs), lhs_batch_dims={}, rhs_batch_dims={}, lhs_contracting_dims={1}, rhs_contracting_dims={1}, @@ -4052,8 +4588,8 @@ ENTRY entry { op::AllReduce(op::DynamicUpdateSlice(_, lhs, _, _))); auto rhs = AllOf(op::Shape("f32[16,6]"), op::Parameter(1)); auto partial_replicated_rhs = - AllOf(op::Shape("f32[16,12]"), op::AllReduce(op::DynamicUpdateSlice( - _, op::CollectivePermute(rhs), _, _))); + AllOf(op::Shape("f32[16,12]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, AllOf(op::Dot(partial_replicated_lhs, partial_replicated_rhs), @@ -4264,6 +4800,1099 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Shape("f32[4,4,12,32]"), op::Reshape(xpose))); } +TEST_F(SpmdPartitioningTest, SimpleDotPartial) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[2,24,100] parameter(0), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[2,32,100] parameter(1), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %dot = f32[2,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[1,24,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[1,32,100]"), op::Parameter(1)); + auto dot = AllOf(op::Shape("f32[1,24,32]"), op::Dot(lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot); +} + +TEST_F(SpmdPartitioningTest, DotPartialContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,100] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[32,100] parameter(1), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[32,50]"), op::Parameter(1)); + auto dot = AllOf(op::Shape("f32[24,32]"), op::Dot(lhs, rhs)); + auto root = 
module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::AllReduce(dot)); +} + +TEST_F(SpmdPartitioningTest, DotPartialContracting2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,100] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[32,100] parameter(1), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[32,50]"), op::Parameter(1)); + auto dot = + AllOf(op::Shape("f32[12,32]"), + op::Dot(AllOf(op::Shape("f32[12,50]"), op::DynamicSlice(lhs, _, _)), + rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::AllReduce(dot)); +} + +TEST_F(SpmdPartitioningTest, DotPartialContracting3) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,100] parameter(0), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %rhs = f32[32,100] parameter(1), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,50]"), op::Parameter(0)); + auto rhs = + AllOf(op::Shape("f32[16,50]"), op::DynamicSlice(op::Parameter(1), _, _)); + auto dot = AllOf(op::Shape("f32[24,16]"), op::Dot(lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::CollectivePermute(op::AllReduce(dot))); +} + +TEST_F(SpmdPartitioningTest, DotBatchAndPartialContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[4,24,100] parameter(0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7} + %rhs = f32[4,32,100] parameter(1), + sharding={devices=[2,1,2,2]0,2,1,3,4,6,5,7 last_tile_dim_replicate} + ROOT %dot = f32[4,24,32] dot(%lhs, %rhs), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,2,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[2,12,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[2,32,50]"), op::Parameter(1)); + auto dot = AllOf(op::Shape("f32[2,12,32]"), op::Dot(lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::AllReduce(dot)); +} + +TEST_F(SpmdPartitioningTest, DotPartialNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[32,100] parameter(1), sharding={devices=[2,2]0,2,1,3} + ROOT %dot = f32[24,8,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={1}, + 
sharding={devices=[2,1,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,8,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[16,50]"), op::Parameter(1)); + auto partially_replicated_rhs = + AllOf(op::Shape("f32[16,100]"), + op::AllReduce(op::DynamicUpdateSlice(op::Broadcast(_), rhs, _, _))); + auto dot = + AllOf(op::Shape("f32[12,8,16]"), op::Dot(lhs, partially_replicated_rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot); +} + +TEST_F(SpmdPartitioningTest, DotPartialNonContractingPartialMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[2,2,1]0,1,2,3} + %rhs = f32[32,100] parameter(1), + sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,8,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={1}, + sharding={devices=[2,1,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,4,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[16,100]"), op::Parameter(1)); + auto partially_replicated_lhs = AllOf( + op::Shape("f32[12,8,100]"), + op::AllReduce(op::DynamicUpdateSlice(op::Broadcast(_), lhs, _, _, _))); + auto dot = + AllOf(op::Shape("f32[12,8,16]"), op::Dot(partially_replicated_lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot); +} + +TEST_F(SpmdPartitioningTest, DotPartialContractingPartialMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[1,2,2]0,1,2,3} + %rhs = f32[32,8,100] parameter(1), + sharding={devices=[1,1,2,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1,2}, rhs_contracting_dims={1,2}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,4,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[32,8,50]"), op::Parameter(1)); + auto dot = AllOf(op::Shape("f32[24,32]"), + op::Dot(lhs, AllOf(op::Shape("f32[32,4,50]"), + op::DynamicSlice(rhs, _, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::AllReduce(op::AllReduce(dot))); +} + +TEST_F(SpmdPartitioningTest, DotNonContractingPartialMatchContractingMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[2,1,2]0,1,2,3} + %rhs = f32[100,50] parameter(1), sharding={devices=[2,2]0,2,1,3} + ROOT %dot = f32[24,8,50] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={0}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,8,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[50,25]"), op::Parameter(1)); + auto dot = AllOf( + op::Shape("f32[12,8,50]"), + op::Dot(lhs, AllOf(op::Shape("f32[50,50]"), + 
op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[12,4,50]"), + op::DynamicSlice(op::AllReduce(dot), _, _, _))) + << module->ToString(); +} + +TEST_F(SpmdPartitioningTest, DotLHSMutiNonContractingRHSNotMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,10] parameter(0), sharding={devices=[2,2,1]0,1,2,3} + %rhs = f32[10,50] parameter(1), + sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,8,50] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={0}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,4,10]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[5,50]"), op::Parameter(1)); + auto dot = AllOf( + op::Shape("f32[12,4,50]"), + op::Dot(lhs, AllOf(op::Shape("f32[10,50]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot) << module->ToString(); +} + +TEST_F(SpmdPartitioningTest, + ElementwiseTest_PartialReplicateToTiledHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[6,3]{1,0} + constant({{1,3,7},{5,1,4},{1,2,8},{2,3,7},{5,2,4},{2,2,8}}), + sharding={replicated} + constant.1 = f32[6,3]{1,0} + constant({{2,7,2},{2,9,2},{2,6,2},{3,7,2},{2,9,3},{2,3,2}}), + sharding={replicated} + multiply = f32[6,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT add = f32[6,3]{1,0} add(multiply, constant.1), + sharding={devices=[4,1]0,1,2,3} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto partial_replicate_lhs = + AllOf(op::Shape("f32[3,3]"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto partial_replicate_rhs = + AllOf(op::Shape("f32[3,3]"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto multiply = + AllOf(op::Shape("f32[3,3]"), + op::Multiply(partial_replicate_lhs, partial_replicate_rhs)); + auto right_halo = + AllOf(op::Shape("f32[1,3]"), op::CollectivePermute(op::Slice(multiply))); + auto add_lhs = AllOf( + op::Shape("f32[2,3]"), + op::DynamicSlice( + op::DynamicSlice( + op::Pad(op::Concatenate(multiply, right_halo), op::Constant()), + op::Reshape(), op::Constant()), + op::Reshape(), op::Constant())); + auto add_rhs = AllOf(op::Shape("f32[2,3]"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]"), op::Add(add_lhs, add_rhs))); +} + +TEST_F(SpmdPartitioningTest, TileToPartialReplicateReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,2]0,1,2,3} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), 
+ op::Reshape()))); + auto partially_replicated = AllOf( + op::Shape("f32[4,8]"), op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), tiled, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateToTileReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(partially_replicated, + op::Constant(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_AllReduce) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(param0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Reshape()))); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), partially_replicated_init, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_DynamicSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(partially_replicated, + op::Constant(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_DynamicSlice2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[1,1,8]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto 
partially_replicated = + AllOf(op::Shape("f32[8,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(partially_replicated, + op::Reshape(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardWithCollectivePermute) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(param0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[4,4]"), + op::CollectivePermute(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Reshape(), op::Reshape())))); + auto partially_replicated = + AllOf(op::Shape("f32[8,4]"), + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), partially_replicated_init, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardCollectivePermute1) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[8,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape()))); + auto tiled = + AllOf(op::Shape("f32[4,4]"), + op::Copy(op::CollectivePermute(op::DynamicSlice( + partially_replicated, op::Reshape(), op::Constant())))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[6,3] parameter(0) + %copy = f32[6,3] copy(param0), + sharding={devices=[4,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[6,3] copy(%copy), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[2,3]"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(0), op::Constant()), + op::Reshape(), op::Constant()))); + auto slice = + AllOf(op::Shape("f32[2,3]"), + op::DynamicSlice(op::Concatenate(op::CollectivePermute(op::Slice( + partially_replicated_init)), + partially_replicated_init), + _, _)); + auto partially_replicated = + AllOf(op::Shape("f32[3,3]"), + op::Copy(op::Slice(op::AllReduce( + op::DynamicUpdateSlice(op::Broadcast(_), slice, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardHaloExchange1) 
{ + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[6,3] parameter(0) + %copy = f32[6,3] copy(param0), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[6,3] copy(%copy), + sharding={devices=[4,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[3,3]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto slice = AllOf( + op::Shape("f32[4,3]"), + op::DynamicSlice(op::Pad(op::Concatenate(partially_replicated_init, + op::CollectivePermute(op::Slice( + partially_replicated_init))), + op::Constant()), + _, _)); + auto partially_replicated = + AllOf(op::Shape("f32[2,3]"), op::Copy(op::DynamicSlice(slice, _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCount) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCountRHSAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_rhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(rhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(lhs, 
resharded_rhs), + op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCountLHSAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithBathGroupCountOutputAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto conv = AllOf(op::Convolution(lhs, rhs), op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Reshape(op::Transpose(op::AllToAll( + op::Reshape(op::Pad(conv, op::Constant()))))), + op::Shape("f32[3,1,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithBathGroupCountOutputAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = 
module->entry_computation()->root_instruction(); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + auto conv = + AllOf(op::Convolution(resharded_lhs, rhs), op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Reshape(op::Transpose(op::AllToAll( + op::Reshape(op::Pad(conv, op::Constant()))))), + op::Shape("f32[3,1,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithFeatureGroupCount) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountRHSAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Reshape(), + op::Constant(), op::Constant(), op::Constant())), + op::Shape("f32[3,1,1,1024]")); + auto resharded_rhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(rhs))))), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(lhs, resharded_rhs), + op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountLHSAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) 
+ %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountAlignOuputWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto conv = AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(conv)))), + op::Shape("f32[8,801,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountAlignOuputWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), 
op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + auto conv = AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(conv)))), + op::Shape("f32[8,801,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithFeatureGroupCountBackProp) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1024,1] parameter(1) + %rhs.copy = f32[5,1,1024,1] copy(%rhs), + sharding={devices=[1,1,2,1]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01oi->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0 rhs_reversal=1x1}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[5,1,512,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, NoReshardOnBroadcastDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[2,3] parameter(0) + %param1 = f32[2,3,20] parameter(1) + %br0 = f32[20,2,20,3,20] broadcast(%param0), dimensions={1,3}, sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %br1 = f32[20,2,20,3,20] broadcast(%param1), dimensions={1,3,4}, sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %add = f32[20,2,20,3,20] add(%br0, %br1), sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %reshape = f32[10,4,10,6,20] reshape(%br0), sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %transpose = f32[2,3,20,20,20] transpose(%br0), dimensions={1,3,0,2,4}, sharding={devices=[1,1,2,2,2]0,1,2,3,4,5,6,7} + %copy_add0 = f32[20,2,20,3,20] copy(%add), sharding={devices=[2,1,2,1,2]6,7,2,3,4,5,0,1} + %copy_add1 = f32[20,2,20,3,20] copy(%add), sharding={devices=[2,1,2,1,2]7,6,3,2,5,4,0,1} + %copy_reshape = f32[10,4,10,6,20] copy(%reshape), sharding={devices=[2,1,2,1,2]7,6,3,2,5,4,0,1} + %copy_transpose = f32[2,3,20,20,20] copy(%transpose), sharding={devices=[1,1,2,2,2]7,6,3,2,5,4,0,1} + ROOT %tuple = (f32[20,2,20,3,20], f32[20,2,20,3,20], f32[10,4,10,6,20], f32[2,3,20,20,20]) + tuple(%copy_add0, %copy_add1, %copy_reshape, %copy_transpose), + sharding={{devices=[2,1,2,1,2]6,7,2,3,4,5,0,1},{devices=[2,1,2,1,2]7,6,3,2,5,4,0,1},{devices=[2,1,2,1,2]7,6,3,2,5,4,0,1},{devices=[1,1,2,2,2]7,6,3,2,5,4,0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + // Reshard on copy_add0 only happens on broadcast dims, can be skipped. + auto copy_add0 = + op::Copy(op::Copy(op::Add(op::Broadcast(_), op::Broadcast(_)))); + // Reshard on copy_add1 also happens on non-broadcast dims. 
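+  // It therefore cannot be skipped, and a collective-permute is expected in + // the matcher below.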
+ auto copy_add1 = op::Copy( + op::CollectivePermute(op::Add(op::Broadcast(_), op::Broadcast(_)))); + // Reshard on copy_reshape only happens on broadcast dims, can be skipped. + auto copy_reshape = op::Copy(op::Copy(op::Reshape(op::Broadcast(_)))); + // Reshard on copy_transpose only happens on broadcast dims, can be skipped. + auto copy_transpose = op::Copy(op::Copy(op::Transpose(op::Broadcast(_)))); + EXPECT_THAT(root, + op::Tuple(copy_add0, copy_add1, copy_reshape, copy_transpose)); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionFilterIFOFPartitionedInputPartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,112,112,12] parameter(0) + %lhs.copy = f32[128,112,112,12] copy(f32[128,112,112,12] %lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[7,7,12,64] parameter(1) + %rhs.copy = f32[7,7,12,64] copy(f32[7,7,12,64] %rhs), + sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %conv = f32[128,56,56,64] convolution( + f32[128,112,112,12] %lhs.copy, + f32[7,7,12,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,112,112,6]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Reshape(), op::Reshape())), + op::Shape("f32[7,7,6,32]")); + + EXPECT_THAT( + root, + AllOf(op::CollectivePermute(op::AllReduce(op::Convolution(lhs, rhs))), + op::Shape("f32[128,56,56,32]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionInputKernelNonContractingDimPartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,56,56,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,28,28,256]")); + + EXPECT_THAT(root, AllOf(op::Convolution(lhs, op::CollectivePermute(rhs)), + op::Shape("f32[1,1,128,256]"))); +} + } // namespace } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 29def16f89d..0edbd4f2b8d 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -29,12 
+29,15 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -47,6 +50,23 @@ bool HasReplicatedSharding(const HloSharding& sharding) { return sharding.IsReplicated(); } +HloInstruction* CreateConstant(const Shape& shape, Literal value, + SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back(CreateConstant( + ShapeUtil::GetTupleElementShape(shape, i), value.Clone(), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + CHECK( + ShapeUtil::IsScalarWithElementType(value.shape(), shape.element_type())); + auto c = b->AddInstruction(HloInstruction::CreateConstant(std::move(value))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, c, {})); +} + HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { if (shape.IsTuple()) { std::vector elements; @@ -183,13 +203,17 @@ std::vector MakePartitionOffsets( absl::Span dims) { CHECK(!shape.IsTuple()); - Array2D offset_array( - {sharding.tile_assignment().num_elements(), shape.rank()}); - offset_array.Each([&](int64 i, int64 j, int32* value) { - *value = sharding.TileOffsetForDevice(shape, i)[j]; - }); - auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector> offset_arrays(shape.rank()); + for (int64 i = 0; i < shape.rank(); ++i) { + offset_arrays[i].resize(sharding.tile_assignment().num_elements()); + } + auto shard_shape = MakePartitionedShape(shape, sharding); + sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + for (int64 i = 0; i < shape.rank(); ++i) { + offset_arrays[i][device] = indices[i] * shard_shape.dimensions(i); + } + }); std::vector offsets; for (int64 i = 0; i < shape.rank(); ++i) { if (sharding.tile_assignment().dim(i) == 1 || @@ -197,11 +221,10 @@ std::vector MakePartitionOffsets( offsets.push_back(b->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); } else { + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(offset_arrays[i]))); auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( - ShapeUtil::MakeShape(S32, {1, 1}), offset_table, - {partition_id, b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(i)))}, - {1, 1})); + ShapeUtil::MakeShape(S32, {1}), offset_table, {partition_id}, {1})); offsets.push_back(b->AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); } @@ -212,8 +235,11 @@ std::vector MakePartitionOffsets( std::vector MakeTiledPartitionOrdinals( const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { CHECK(!sharding.IsTileMaximal()); - auto table_shape = - ShapeUtil::MakeShape(S32, 
sharding.tile_assignment().dimensions()); + auto dimensions = sharding.tile_assignment().dimensions(); + if (sharding.ReplicateOnLastTileDim()) { + dimensions.pop_back(); + } + auto table_shape = ShapeUtil::MakeShape(S32, dimensions); return MakePartitionOffsets(table_shape, sharding, partition_id, b); } @@ -270,12 +296,341 @@ HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( return PadToShape(hlo, padded_base_shape, b); } +absl::optional PartialReplicateReshardCompatibleSharding( + const HloSharding& partial_sharding, const HloSharding& target_sharding) { + if (!partial_sharding.ReplicateOnLastTileDim()) { + return absl::nullopt; + } + int64 rank = partial_sharding.tile_assignment().num_dimensions() - 1; + int64 target_rank = target_sharding.tile_assignment().num_dimensions() - + (target_sharding.ReplicateOnLastTileDim() ? 1 : 0); + if (target_rank != rank) { + return absl::nullopt; + } + + absl::flat_hash_map device_to_replication_group; + partial_sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + int64 gid = 0; + for (int64 i = 0; i < rank; ++i) { + gid *= partial_sharding.tile_assignment().dim(i); + gid += indices[i]; + } + device_to_replication_group[device] = gid; + }); + + // A dimension is expanded when target_tile_size > partial_tile_size and + // target_tile_size % partial_tile_size == 0. + // expand_tile_dims_positions is the index of the expand_dim. + std::vector expand_tile_dims_indices(rank, -1); + // expand_tile_size = target_tile_size / partial_tile_size. + std::vector expand_tile_sizes; + int num_expand_dims = 0; + for (int64 dim = 0; dim < rank; dim++) { + int64 partial_tile_size = partial_sharding.tile_assignment().dim(dim); + int64 target_tile_size = target_sharding.tile_assignment().dim(dim); + if (target_tile_size % partial_tile_size != 0 || + target_tile_size < partial_tile_size) { + return absl::nullopt; + } + + if (target_tile_size > partial_tile_size) { + expand_tile_dims_indices[dim] = num_expand_dims++; + expand_tile_sizes.emplace_back(target_tile_size / partial_tile_size); + } + } + + // Reshape the partial replicate tile_dimensions. + int64 num_target_replication = 1; + if (target_sharding.ReplicateOnLastTileDim()) { + num_target_replication = + target_sharding.tile_assignment().dimensions().back(); + } + auto reshape_dimensions = partial_sharding.tile_assignment().dimensions(); + int64 num_replication = reshape_dimensions.back(); + if (num_replication / num_target_replication != Product(expand_tile_sizes) || + num_replication % num_target_replication != 0) { + return absl::nullopt; + } + + reshape_dimensions.pop_back(); + reshape_dimensions.insert(reshape_dimensions.end(), expand_tile_sizes.begin(), + expand_tile_sizes.end()); + + if (target_sharding.ReplicateOnLastTileDim()) { + reshape_dimensions.push_back(num_target_replication); + } + + auto reshape_tile_assignment = partial_sharding.tile_assignment(); + reshape_tile_assignment.Reshape(reshape_dimensions); + + // Transpose. + std::vector perm; + perm.reserve(rank + expand_tile_sizes.size()); + for (int64 dim = 0; dim < rank; dim++) { + perm.emplace_back(dim); + if (expand_tile_dims_indices[dim] > -1) { + perm.emplace_back(expand_tile_dims_indices[dim] + rank); + } + } + auto transpose_sharding = hlo_sharding_util::TransposeSharding( + target_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(reshape_tile_assignment) + : HloSharding::Tile(reshape_tile_assignment), + perm); + + // Reshape to target shape + auto transpose_tile_assignment = transpose_sharding.tile_assignment(); + transpose_tile_assignment.Reshape( + target_sharding.tile_assignment().dimensions()); + + bool groups_matching = true; + target_sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (device_to_replication_group[device] != + device_to_replication_group[transpose_tile_assignment(indices)]) { + groups_matching = false; + } + }); + + if (groups_matching) { + return target_sharding; + } + return target_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(transpose_tile_assignment) + : HloSharding::Tile(transpose_tile_assignment); +} + +absl::optional TileToPartialReplicateHaloExchange( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& replicate_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b) { + // Source is tile sharding. + auto padded_src_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, src_sharding); + // Target is partial replicate. + auto padded_dst_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, dst_sharding); + if (ShapeUtil::Compatible(padded_dst_shape, hlo->shape())) { + return hlo; + } + + auto partition_ordinals = + MakeTiledPartitionOrdinals(dst_sharding, partition_id, b); + + auto result = hlo; + auto hlo_shape = hlo->shape(); + for (auto dim : replicate_dims) { + int64 dst_shard_count = dst_sharding.tile_assignment().dim(dim); + int64 src_per_shard_size = + padded_src_shape.dimensions(dim) / dst_shard_count; + // Calculate per shard size using the sharding to compare if dst_sharding + // needs more padding at the end. + int64 dst_per_shard_size = + padded_dst_shape.dimensions(dim) / dst_shard_count; + + // If the src per-shard data has no redundant elements, nothing to do. + if (src_per_shard_size <= dst_per_shard_size || dst_shard_count == 1) { + continue; + } + + // If src_per_shard * replicate_factor > dst_per_shard, we need to + // re-distribute the data between the shards using collective permute. For + // example, if the dimension size is 6 and it is sharded 4 ways in the src + // but needs to be sharded 2 ways in the dst: 4-way sharding has 2 elements + // in each shard, while 2-way sharding has 3 elements, so the last element + // in the first shard would otherwise be sliced out and re-distribution is + // needed. + // + // 1. Calculate left_halo size. + // left-halo size is + // (src_per_shard_size - dst_per_shard_size) * i / replicate_factor + int64 replicate_factor = src_sharding.tile_assignment().dim(dim) / + dst_sharding.tile_assignment().dim(dim); + OffsetCalculation left_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + src_per_shard_size - dst_per_shard_size, 0, replicate_factor)); + + // 2. Calculate right_halo size. + // right-halo size is 0 + OffsetCalculation right_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation(0, 0, 1)); + + auto concat = result; + // 3. Halo exchange. + auto halo_exchange_result = ExchangeHalo( + result, left_halo_size_function, right_halo_size_function, dim, + src_sharding, collective_ops_creator, next_channel_id, b); + + if (halo_exchange_result.has_value()) { + concat = halo_exchange_result.value(); + } else { + return absl::nullopt; + } + + // 4. Slice the valid result.
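+    // (Assuming MultiplyAddDivideOffsetCalculation(m, a, d) evaluates to + // (m * i + a) / d, the constants below expand to + // (src_per_shard_size - dst_per_shard_size) * (dst_shard_count - 1 - i), + // which matches the offset formula in the next comment.)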
+    // Slice offset is + // (dst_shard_count - i - 1) * + // (src_per_shard_size - dst_per_shard_size) + // where i is the shard index under dst_sharding. + auto zero_s32 = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + OffsetCalculation start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + dst_per_shard_size - src_per_shard_size, + (src_per_shard_size - dst_per_shard_size) * (dst_shard_count - 1), + 1)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, + padded_src_shape.dimensions(dim) / + src_sharding.tile_assignment().dim(dim)); + std::vector slice_offsets(concat->shape().rank(), + zero_s32); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinals[dim], b); + result = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + return result; +} + +absl::optional PadFromPartialReplicateShape( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& expand_tile_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b) { + auto padded_src_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, src_sharding); + auto padded_dst_shape = + GetPaddedShapeForUnevenPartitioning(base_shape, dst_sharding); + if (ShapeUtil::Compatible(padded_dst_shape, hlo->shape())) { + return hlo; + } + + auto partition_ordinals = + MakeTiledPartitionOrdinals(src_sharding, partition_id, b); + + HloInstruction* result = hlo; + auto zero = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + std::vector expand_dims_without_halo_exchange; + // Pad the dimensions that need halo exchange and record the padded dims + // that won't need halo exchange. + for (auto dim : expand_tile_dims) { + int64 src_shard_count = src_sharding.tile_assignment().dim(dim); + int64 src_per_shard_size = + padded_src_shape.dimensions(dim) / src_shard_count; + // Calculate per shard size using the sharding to compare if dst_sharding + // needs more padding at the end. + int64 dst_per_shard_size = + padded_dst_shape.dimensions(dim) / src_shard_count; + + // If dst_sharding doesn't need more padding at the end. + if (src_per_shard_size >= dst_per_shard_size) { + continue; + } + // If src sharding at this dimension is not partitioned, simply pad to + // the desired shape. + if (src_shard_count == 1) { + expand_dims_without_halo_exchange.emplace_back(dim); + continue; + } + + // If dst_sharding needs more padding at the end, we need to re-distribute + // the data between the shards using collective permute. + // For example, if the dimension size is 6 and it is sharded 2 ways in the + // src but needs to be sharded 4 ways in the dst: 4-way sharding needs 2 + // zeros of padding at the end and has 2 elements in each shard, while + // 2-way sharding has 3 elements in each shard, so re-distribution is + // needed. + // + // 1. Calculate left_halo size. + // left-halo size is 0 + OffsetCalculation left_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation(0, 0, 1)); + + // 2. Calculate right_halo size.
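+    // (In the formula below, D denotes dst_per_shard_size and S denotes + // src_per_shard_size.)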
+ // right-halo size is D * (i + 1) - S * (i + 1) = (D - S) * i + (D - S) + OffsetCalculation right_halo_size_function = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + dst_per_shard_size - src_per_shard_size, + dst_per_shard_size - src_per_shard_size, 1)); + + auto concat = result; + // 3. Halo exchange. + auto halo_exchange_result = ExchangeHalo( + result, left_halo_size_function, right_halo_size_function, dim, + src_sharding, collective_ops_creator, next_channel_id, b); + + if (halo_exchange_result.has_value()) { + concat = halo_exchange_result.value(); + } else { + return absl::nullopt; + } + + // 4. Pad. + std::vector zero_padding(concat->shape().rank()); + PaddingConfig pad_config = window_util::MakeSymmetricPadding(zero_padding); + pad_config.mutable_dimensions(dim)->set_edge_padding_low(0); + int64 max_right_halo_size = + right_halo_size_function.MaxInRange(0, src_shard_count - 1); + pad_config.mutable_dimensions(dim)->set_edge_padding_high(std::max( + 0LL, padded_dst_shape.dimensions(dim) - + padded_src_shape.dimensions(dim) - max_right_halo_size)); + auto padded_concat_shape = ShapeInference::InferPadShape( + concat->shape(), zero->shape(), pad_config) + .ValueOrDie(); + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, zero, pad_config)); + + // 5. Slice the valid result. + // Slice offset is (D-S) * i + auto zero_s32 = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + OffsetCalculation start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + dst_per_shard_size - src_per_shard_size, 0, 1)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, dst_per_shard_size); + std::vector slice_offsets(concat->shape().rank(), + zero_s32); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinals[dim], b); + result = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + + // Pad other dimensions that won't need halo exchange with a single pad. + if (!expand_dims_without_halo_exchange.empty()) { + std::vector zero_padding(result->shape().rank()); + PaddingConfig pad_config = window_util::MakeSymmetricPadding(zero_padding); + + auto padded_shape = result->shape(); + for (auto dim : expand_dims_without_halo_exchange) { + pad_config.mutable_dimensions(dim)->set_edge_padding_low(0); + pad_config.mutable_dimensions(dim)->set_edge_padding_high( + padded_dst_shape.dimensions(dim) - padded_src_shape.dimensions(dim)); + padded_shape.set_dimensions(dim, result->shape().dimensions(dim) + + padded_dst_shape.dimensions(dim) - + padded_src_shape.dimensions(dim)); + } + result = b->AddInstruction( + HloInstruction::CreatePad(padded_shape, result, zero, pad_config)); + } + + return result; +} + absl::optional UniqueTiledDim(const HloSharding& sharding) { if (sharding.IsTileMaximal()) { return absl::nullopt; } int64 dim = -1; - for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + int64 rank = sharding.ReplicateOnLastTileDim() + ? 
sharding.tile_assignment().num_dimensions() - 1 + : sharding.tile_assignment().num_dimensions(); + for (int64 i = 0; i < rank; ++i) { if (sharding.tile_assignment().dim(i) > 1) { if (dim != -1) { return absl::nullopt; @@ -925,7 +1280,8 @@ GetReshardAllToAllSourceTargetDims(const HloSharding& source, const HloSharding& target) { if (source.IsTileMaximal() || target.IsTileMaximal() || source.tile_assignment().num_dimensions() != - target.tile_assignment().num_dimensions()) { + target.tile_assignment().num_dimensions() || + source.NumTiles() != target.NumTiles()) { return absl::nullopt; } // Record partition count to index for indices that have different partition @@ -1010,61 +1366,112 @@ bool CanReshardWithCollectivePermute(const HloSharding& source, return !source.IsTileMaximal() && !target.IsTileMaximal() && source.tile_assignment().dimensions() == target.tile_assignment().dimensions() && + source.ReplicateOnLastTileDim() == target.ReplicateOnLastTileDim() && source.tile_assignment() != target.tile_assignment(); } GroupedSharding GroupShardingOnDims(const HloSharding& sharding, absl::Span group_dims) { + std::vector group_dim_shards(group_dims.size(), 1); + return GroupShardingOnDims(sharding, group_dims, group_dim_shards); +} + +GroupedSharding GroupShardingOnDims(const HloSharding& sharding, + absl::Span group_dims, + absl::Span group_dim_shards) { CHECK(!sharding.IsTileMaximal()); std::vector grouped_tiling_dims = sharding.tile_assignment().dimensions(); std::vector group_dim_sizes(group_dims.size()); for (int64 i = 0; i < group_dims.size(); ++i) { - group_dim_sizes[i] = grouped_tiling_dims[group_dims[i]]; - grouped_tiling_dims[group_dims[i]] = 1; + CHECK_EQ(grouped_tiling_dims[group_dims[i]] % group_dim_shards[i], 0); + group_dim_sizes[i] = + grouped_tiling_dims[group_dims[i]] / group_dim_shards[i]; + grouped_tiling_dims[group_dims[i]] = group_dim_shards[i]; } + std::vector> device_groups(Product(group_dim_sizes)); sharding.tile_assignment().Each( [&](absl::Span indices, int64 device) { int64 group_id = 0; - for (int64 dim : group_dims) { - group_id *= sharding.tile_assignment().dim(dim); - group_id += indices[dim]; + for (int64 i = 0; i < group_dims.size(); ++i) { + group_id *= sharding.tile_assignment().dim(group_dims[i]) / + group_dim_shards[i]; + group_id += indices[group_dims[i]] / group_dim_shards[i]; } device_groups[group_id].push_back(device); }); - Array grouped_tiling(grouped_tiling_dims); - grouped_tiling.FillIota(0); - return GroupedSharding( + auto grouped = GroupedSharding( std::move(device_groups), std::vector(group_dims.begin(), group_dims.end()), std::move(group_dim_sizes), sharding.tile_assignment().num_dimensions(), - HloSharding::Tile(grouped_tiling)); + HloSharding::Replicate()); + if (sharding.ReplicateOnLastTileDim()) { + grouped.data_rank--; + } + if (Product(grouped_tiling_dims) == 1 || + (sharding.ReplicateOnLastTileDim() && + Product(grouped_tiling_dims) == grouped_tiling_dims.back())) { + return grouped; + } + if (sharding.ReplicateOnLastTileDim() && grouped_tiling_dims.back() == 1) { + grouped_tiling_dims.pop_back(); + } + Array grouped_tiling(grouped_tiling_dims); + grouped_tiling.FillIota(0); + grouped.sharding = sharding.ReplicateOnLastTileDim() && + grouped_tiling_dims.size() == + sharding.tile_assignment().num_dimensions() + ? 
HloSharding::PartialTile(grouped_tiling) + : HloSharding::Tile(grouped_tiling); + return grouped; } HloSharding UngroupSharding(const GroupedSharding& grouped_sharding) { - CHECK(!grouped_sharding.sharding.IsTileMaximal()); - std::vector tiling_dims = - grouped_sharding.sharding.tile_assignment().dimensions(); + std::vector tiling_dims; + bool partial_sharding = false; + auto grouped_tiling = grouped_sharding.sharding.tile_assignment(); + if (grouped_sharding.sharding.IsTileMaximal()) { + tiling_dims = std::vector(grouped_sharding.data_rank, 1); + if (grouped_sharding.device_groups[0].size() != 1) { + // This is partial sharding. + tiling_dims.push_back(grouped_sharding.device_groups[0].size()); + partial_sharding = true; + } + grouped_tiling = Array(tiling_dims); + grouped_tiling.FillIota(0); + } else { + partial_sharding = grouped_sharding.sharding.ReplicateOnLastTileDim(); + tiling_dims = grouped_sharding.sharding.tile_assignment().dimensions(); + if (absl::c_linear_search(grouped_sharding.group_dims, + tiling_dims.size())) { + tiling_dims.push_back(1); + grouped_tiling.Reshape(tiling_dims); + partial_sharding = true; + } + } for (int64 i = 0; i < grouped_sharding.group_dims.size(); ++i) { - tiling_dims[grouped_sharding.group_dims[i]] = - grouped_sharding.group_dim_sizes[i]; + int64 dim = grouped_sharding.group_dims[i]; + tiling_dims[dim] *= grouped_sharding.group_dim_sizes[i]; } Array tiling(tiling_dims); - grouped_sharding.sharding.tile_assignment().Each( - [&](absl::Span indices, int64 device) { - std::vector ungrouped_inds(indices.begin(), indices.end()); - for (int64 g = 0; g < grouped_sharding.device_groups.size(); ++g) { - int64 remaining_group_index = g; - for (int64 i = grouped_sharding.group_dims.size() - 1; i >= 0; --i) { - ungrouped_inds[grouped_sharding.group_dims[i]] = - remaining_group_index % grouped_sharding.group_dim_sizes[i]; - remaining_group_index /= grouped_sharding.group_dim_sizes[i]; - } - tiling(ungrouped_inds) = grouped_sharding.device_groups[g][device]; - } - }); - return HloSharding::Tile(tiling); + grouped_tiling.Each([&](absl::Span indices, int64 device) { + std::vector ungrouped_inds(indices.begin(), indices.end()); + for (int64 g = 0; g < grouped_sharding.device_groups.size(); ++g) { + int64 remaining_group_index = g; + for (int64 i = grouped_sharding.group_dims.size() - 1; i >= 0; --i) { + int64 dim = grouped_sharding.group_dims[i]; + int64 groups_in_this_dim = grouped_sharding.group_dim_sizes[i]; + ungrouped_inds[dim] = (remaining_group_index % groups_in_this_dim) * + grouped_tiling.dim(dim) + + indices[dim]; + remaining_group_index /= groups_in_this_dim; + } + tiling(ungrouped_inds) = grouped_sharding.device_groups[g][device]; + } + }); + return partial_sharding ? HloSharding::PartialTile(tiling) + : HloSharding::Tile(tiling); } GroupedSharding AlignGroupsWith(GroupedSharding grouped_sharding, @@ -1118,12 +1525,15 @@ GroupedSharding AlignGroupsWith(GroupedSharding grouped_sharding, grouped_sharding.device_groups[g], reference.device_groups[ref_g]); } } - if (matching_groups) { + if (matching_groups && !grouped_sharding.sharding.IsTileMaximal()) { auto tiles = grouped_sharding.sharding.tile_assignment(); tiles.Each([&](absl::Span indices, int64* device) { *device = original_src_to_ref_permutation[*device]; }); - grouped_sharding.sharding = HloSharding::Tile(tiles); + grouped_sharding.sharding = + grouped_sharding.sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(tiles) + : HloSharding::Tile(tiles); } grouped_sharding.device_groups = std::move(reference.device_groups); return grouped_sharding; @@ -1134,6 +1544,9 @@ Shape GetPerGroupBaseShape(const GroupedSharding& grouped_sharding, auto result = original_base_shape; for (int64 i = 0; i < grouped_sharding.group_dims.size(); ++i) { int64 dim = grouped_sharding.group_dims[i]; + if (dim >= original_base_shape.rank()) { + continue; + } int64 groups = grouped_sharding.group_dim_sizes[i]; result.set_dimensions(dim, result.dimensions(dim) / groups); } @@ -1305,49 +1718,6 @@ HloInstruction* PerGroupSliceFromReplicated( shard_shape.dimensions())); } -absl::optional TransposeShardingWithCollapsedDims( - const HloSharding& source, absl::Span src_to_tgt, - absl::Span tgt_to_src) { - if (source.IsTileMaximal()) { - return source; - } - std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); - int64 skipped_tgt_dims = 0; - for (int64 i = 0; i < tgt_to_src.size(); ++i) { - if (tgt_to_src[i] < 0) { - skipped_tgt_dims++; - } else { - tgt_dims_skipping_new[i] = i - skipped_tgt_dims; - } - } - int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); - std::vector perm(src_to_tgt.size()); - for (int64 i = 0; i < src_to_tgt.size(); ++i) { - if (src_to_tgt[i] < 0) { - if (source.tile_assignment().dim(i) > 1) { - return absl::nullopt; - } - perm[src_to_tgt.size() - skipped_src_dims] = i; - skipped_src_dims--; - } else { - perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; - } - } - auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); - if (skipped_tgt_dims == 0) { - return tgt_sharding; - } - auto reshape_tiles = tgt_sharding.tile_assignment(); - std::vector tgt_tiles(tgt_to_src.size(), 1); - for (int64 i = 0; i < tgt_tiles.size(); ++i) { - if (tgt_to_src[i] >= 0) { - tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); - } - } - reshape_tiles.Reshape(tgt_tiles); - return HloSharding::Tile(reshape_tiles); -} - absl::optional ParseReductionComputation( const HloComputation* reduction_comp) { if (reduction_comp->num_parameters() != 2) { @@ -1366,5 +1736,47 @@ absl::optional ParseReductionComputation( return root->opcode(); } +absl::optional> FindMatchingPartitionedDimsForGrouping( + const HloSharding& sharding, + const std::vector>& device_groups) { + if (sharding.NumTiles() < device_groups.size() || device_groups.size() < 2 || + device_groups[0].size() < 2) { + return absl::nullopt; + } + int64 rank = sharding.tile_assignment().num_dimensions(); + if (sharding.ReplicateOnLastTileDim()) { + rank--; + } + absl::flat_hash_map> device_to_index; + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + device_to_index[device] = + std::vector(index.begin(), index.begin() + rank); + }); + std::vector dims; + int64 group_count = 1; + for (int64 i = 0; i < rank; ++i) { + if (device_to_index[device_groups[0][0]][i] == + device_to_index[device_groups[0][1]][i]) { + dims.push_back(i); + group_count *= sharding.tile_assignment().dim(i); + } + } + if (group_count != device_groups.size()) { + return absl::nullopt; + } + for (const auto& group : device_groups) { + for (int64 i = 1; i < group.size(); ++i) { + if (absl::c_any_of(dims, [&](const int64 dim) { + return device_to_index[group[i]][dim] != + device_to_index[group[0]][dim]; + })) { + return absl::nullopt; + } + } + } + return dims; +} + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index 
10b630e31ee..4fc193d9622 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -33,6 +33,10 @@ namespace spmd { // Returns true if the given sharding contains any replicated sharding. bool HasReplicatedSharding(const HloSharding& sharding); +// Creates constant value instructions of the given shape. The literal must be +// a scalar and is broadcast to the given shape. +HloInstruction* CreateConstant(const Shape& shape, Literal value, + SpmdBuilder* b); // Creates zero value instructions of the given shape. HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b); @@ -287,19 +291,25 @@ bool CanReshardWithCollectivePermute(const HloSharding& source, struct GroupedSharding { GroupedSharding(std::vector> device_groups, std::vector group_dims, - std::vector group_dim_sizes, int64 rank, + std::vector group_dim_sizes, int64 data_rank, HloSharding grouped_sharding) : device_groups(std::move(device_groups)), group_dims(std::move(group_dims)), group_dim_sizes(std::move(group_dim_sizes)), + data_rank(data_rank), sharding(std::move(grouped_sharding)) {} std::vector> device_groups; std::vector group_dims; std::vector group_dim_sizes; - int64 rank; + int64 data_rank; HloSharding sharding; }; +// Creates a GroupedSharding for a tiled sharding with group dim shard sizes. +GroupedSharding GroupShardingOnDims(const HloSharding& sharding, + absl::Span group_dims, + absl::Span group_dim_shards); + // Creates a GroupedSharding for a tiled sharding. GroupedSharding GroupShardingOnDims(const HloSharding& sharding, absl::Span group_dims); @@ -331,18 +341,50 @@ HloInstruction* PerGroupSliceFromReplicated( absl::Span group_dims, absl::Span group_dim_sizes, SpmdBuilder* b); -// Similar to hlo_sharding_util::TransposeSharding(), but allows removing/adding -// non-partitioned dimensions. In src_to_tgt and tgt_to_src, -1 represents a -// non-existing dimension. -absl::optional TransposeShardingWithCollapsedDims( - const HloSharding& source, absl::Span src_to_tgt, - absl::Span tgt_to_src); - // Returns the opcode if `reduction_comp` represents a simple binary elementwise // computation on the two operands. absl::optional ParseReductionComputation( const HloComputation* reduction_comp); +// Pads hlo from its partial replicate shape to the shape required by +// `dst_sharding`. If dst_sharding needs more padding and the per-shard size +// increases under dst_sharding, a halo exchange on the right side is needed. +absl::optional PadFromPartialReplicateShape( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& expand_tile_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b); + +// Gets a compatible sharding for resharding from a partial replicate sharding +// to a desired target tiled sharding. +// Compatible means the partially replicated sharding can be transformed into +// the target tile dimensions by a dynamic slice. +// For example, if partial_sharding is +// {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +// and the target sharding is {devices=[2,2]0,1,2,3}, the returned compatible +// sharding will be sharding={devices=[2,2]0,2,1,3}. +// If the input sharding is not partially replicated or can't be resharded to +// target_tile_dims by a dynamic slice, returns absl::nullopt. +// If target_sharding is already compatible, returns it.
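+// A minimal usage sketch (illustrative only; variable names are hypothetical): +//   auto compatible = +//       PartialReplicateReshardCompatibleSharding(partial_sharding, target); +//   if (compatible.has_value()) { +//     // Dynamic-slice the partially replicated data according to *compatible; +//     // *compatible may still differ from `target` in device order. +//   }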
+absl::optional PartialReplicateReshardCompatibleSharding( + const HloSharding& partial_sharding, const HloSharding& target_sharding); + +// Do left halo exchange if all-reduce directly from tile sharding to partial +// replicate sharding will remove useful data from the source. +absl::optional TileToPartialReplicateHaloExchange( + HloInstruction* hlo, const Shape& base_shape, + const HloSharding& src_sharding, const HloSharding& dst_sharding, + const std::vector& replicate_dims, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b); + +// Finds a list of dimensions that can be grouped on such that it will have the +// specified device groups. Group order and dimension order are ignored. +absl::optional> FindMatchingPartitionedDimsForGrouping( + const HloSharding& sharding, + const std::vector>& device_groups); + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc index d54eb9e78c3..4015c69e3e2 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.cc +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc @@ -89,16 +89,23 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) { // The last block might be smaller than the block size, // so we will need to pad it if (n % block_size != 0) { - // Pad with zeros + // Pad with identity matrix. auto last_blocks = SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n}); PaddingConfig config = MakeNoPaddingConfig(ndims); int64 padding = block_size - n % block_size; - config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding); config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding); last_blocks = Pad(last_blocks, Zero(builder, shape.element_type()), config); + auto eye = + IdentityMatrix(builder, shape.element_type(), padding, padding); + config = MakeNoPaddingConfig(ndims); + config.mutable_dimensions(ndims - 2)->set_edge_padding_low(n % + block_size); + eye = Pad(eye, Zero(builder, shape.element_type()), config); + last_blocks = ConcatInDim(builder, {last_blocks, eye}, ndims - 1); + // Add a singleton dimension // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size] TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks)); @@ -121,134 +128,6 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) { }); } -XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a, - bool conjugate_a, - PrecisionConfig::Precision precision) { - XlaBuilder* builder = diag_blocks.builder(); - return builder->ReportErrorOrReturn([&]() -> StatusOr { - // Input is a batch of square lower triangular square matrices. Its shape is - // (..., size, size). We resize this to (num_blocks, size, size). 
- TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks)); - int64 block_size = ShapeUtil::GetDimension(shape, -1); - int64 num_blocks = ShapeUtil::ElementsIn(shape) / - tensorflow::MathUtil::IPow(block_size, 2); - diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size}); - - // The input must be triangular because we rely on that when doing - // multiplications later on - diag_blocks = Triangle(diag_blocks, /*lower=*/lower); - - // Rescale blocks to be unit triangular, but avoid dividing by - // zero (which can happen if the last block was padded) otherwise it will - // introduce nans which will propagate - auto diags = GetMatrixDiagonal(diag_blocks); - auto ones = FullLike(diags, 1); - diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags); - auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2}); - - // We can now use the fact that for an upper triangular matrix - // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have - // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks - // have been rescaled to be unit triangular, so L22 = L22' = 1. - - // Initialize the output matrix with -1s on the diagonal. We use -1 instead - // of 1 because we cannot do matrix-vector multiplies with variable shapes - // inside of a loop, or do irregularly shaped in-place updates. Hence, - // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the - // entire row i.e. we calculate - // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I]) - // which means [L21 L22 0] <- [-L21 * L11', L22, 0]. - auto identity = - IdentityMatrix(builder, shape.element_type(), block_size, block_size); - auto neg_identity = -identity; - - // The first or last diagonal element should be set to 1 instead of -1 - // though, since we never update it - auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1}); - auto start_index = ConstantR0(builder, (lower) ? 0 : block_size - 1); - auto output_block = - DynamicUpdateSlice(neg_identity, pos_one, - /*start_indices=*/{start_index, start_index}); - - // Broadcast diag([1, -1, -1, ...]) to every block - XlaOp output = Broadcast(output_block, - /*broadcast_sizes=*/{num_blocks}); - - // Now we construct a loop that performs matrix-vector multiplications - // inverting the blocks one row at a time - std::vector tuple_shapes = { - // The loop iteration counter is a scalar, incremented each iteration. - ShapeUtil::MakeShape(S32, {}), - // The output has the shape of A, with one row updated each iteration. - ShapeUtil::MakeShape(shape.element_type(), - {num_blocks, block_size, block_size}), - // The input is a loop invariant. - ShapeUtil::MakeShape(shape.element_type(), - {num_blocks, block_size, block_size})}; - Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes); - - auto init_i = One(builder, S32); - auto init = Tuple(builder, {init_i, output, scaled_diag_blocks}); - - // Construct the loop condition function. - std::unique_ptr condb = - builder->CreateSubBuilder("InvertDiagCond"); - { - auto i = GetTupleElement( - Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0); - Lt(i, ConstantR0(condb.get(), block_size)); - } - TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); - - // Construct the loop body function. 
- std::unique_ptr bodyb = - builder->CreateSubBuilder("InvertDiagBody"); - { - auto input_tuple = - Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple"); - - auto i = GetTupleElement(input_tuple, 0); - auto body_out = GetTupleElement(input_tuple, 1); - auto body_input = GetTupleElement(input_tuple, 2); - - auto zero = ConstantR0(bodyb.get(), 0); - auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i; - auto input_row = - DynamicSlice(body_input, {zero, j, zero}, - /*slice_sizes=*/{num_blocks, 1, block_size}); - - // We want -L21 L11^{-1} - DotDimensionNumbers dnums; - dnums.add_lhs_batch_dimensions(0); - dnums.add_rhs_batch_dimensions(0); - dnums.add_lhs_contracting_dimensions(2); - dnums.add_rhs_contracting_dimensions(1); - PrecisionConfig precision_proto; - precision_proto.add_operand_precision(precision); - precision_proto.add_operand_precision(precision); - auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto); - - body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero}); - - auto next_i = i + ScalarLike(i, 1); - Tuple(bodyb.get(), {next_i, body_out, body_input}); - } - TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); - - // Construct the While loop and return the result, - // return while_loop(cond_fun, body_fun, init)[1] - auto invert_while = While(cond, body, init); - auto inv_diag_blocks = GetTupleElement(invert_while, 1); - - // Undo the scaling - inv_diag_blocks = Div(inv_diag_blocks, diags, - /*broadcast_dimensions=*/{0, 1}); - - // Reshape back to original batch major dimensions - return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions())); - }); -} - XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, bool left_side, bool lower, bool transpose_a, bool conjugate_a, @@ -357,10 +236,140 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, }); } -XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, - bool transpose_a, bool conjugate_a, - bool unit_diagonal, int64 block_size, - PrecisionConfig::Precision precision) { +} // namespace + +XlaOp TriangularSolveExpander::InvertDiagonalBlocks( + XlaOp diag_blocks, bool lower_triangular, + PrecisionConfig::Precision precision) { + XlaBuilder* builder = diag_blocks.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + // Input is a batch of square lower triangular square matrices. Its shape is + // (..., size, size). We resize this to (num_blocks, size, size). + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks)); + int64 block_size = ShapeUtil::GetDimension(shape, -1); + int64 num_blocks = ShapeUtil::ElementsIn(shape) / + tensorflow::MathUtil::IPow(block_size, 2); + diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size}); + + // The input must be triangular because we rely on that when doing + // multiplications later on + diag_blocks = Triangle(diag_blocks, /*lower=*/lower_triangular); + + // Rescale blocks to be unit triangular, but avoid dividing by + // zero (which can happen if the last block was padded) otherwise it will + // introduce nans which will propagate + auto diags = GetMatrixDiagonal(diag_blocks); + auto ones = FullLike(diags, 1); + diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags); + auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2}); + + // We can now use the fact that for an upper triangular matrix + // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have + // L22' = -L22' * L21 * L11'. 
In our case, L21 is a vector and our blocks + // have been rescaled to be unit triangular, so L22 = L22' = 1. + + // Initialize the output matrix with -1s on the diagonal. We use -1 instead + // of 1 because we cannot do matrix-vector multiplies with variable shapes + // inside of a loop, or do irregularly shaped in-place updates. Hence, + // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the + // entire row i.e. we calculate + // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I]) + // which means [L21 L22 0] <- [-L21 * L11', L22, 0]. + auto identity = + IdentityMatrix(builder, shape.element_type(), block_size, block_size); + auto neg_identity = -identity; + + // The first or last diagonal element should be set to 1 instead of -1 + // though, since we never update it + auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1}); + auto start_index = + ConstantR0(builder, lower_triangular ? 0 : block_size - 1); + auto output_block = + DynamicUpdateSlice(neg_identity, pos_one, + /*start_indices=*/{start_index, start_index}); + + // Broadcast diag([1, -1, -1, ...]) to every block + XlaOp output = Broadcast(output_block, + /*broadcast_sizes=*/{num_blocks}); + + // Now we construct a loop that performs matrix-vector multiplications + // inverting the blocks one row at a time + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + ShapeUtil::MakeShape(S32, {}), + // The output has the shape of A, with one row updated each iteration. + ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size}), + // The input is a loop invariant. + ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size})}; + Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes); + + auto init_i = One(builder, S32); + auto init = Tuple(builder, {init_i, output, scaled_diag_blocks}); + + // Construct the loop condition function. + std::unique_ptr condb = + builder->CreateSubBuilder("InvertDiagCond"); + { + auto i = GetTupleElement( + Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0); + Lt(i, ConstantR0(condb.get(), block_size)); + } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function. + std::unique_ptr bodyb = + builder->CreateSubBuilder("InvertDiagBody"); + { + auto input_tuple = + Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple"); + + auto i = GetTupleElement(input_tuple, 0); + auto body_out = GetTupleElement(input_tuple, 1); + auto body_input = GetTupleElement(input_tuple, 2); + + auto zero = ConstantR0(bodyb.get(), 0); + auto j = lower_triangular ? 
i : ScalarLike(i, block_size - 1) - i; + auto input_row = + DynamicSlice(body_input, {zero, j, zero}, + /*slice_sizes=*/{num_blocks, 1, block_size}); + + // We want -L21 L11^{-1} + DotDimensionNumbers dnums; + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + dnums.add_lhs_contracting_dimensions(2); + dnums.add_rhs_contracting_dimensions(1); + PrecisionConfig precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto); + + body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero}); + + auto next_i = i + ScalarLike(i, 1); + Tuple(bodyb.get(), {next_i, body_out, body_input}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto invert_while = While(cond, body, init); + auto inv_diag_blocks = GetTupleElement(invert_while, 1); + // Undo the scaling + inv_diag_blocks = Div(inv_diag_blocks, diags, + /*broadcast_dimensions=*/{0, 1}); + + // Reshape back to original batch major dimensions + return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions())); + }); +} + +XlaOp TriangularSolveExpander::BuildTriangularSolve( + XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, bool unit_diagonal, int64 block_size, + PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); @@ -422,6 +431,11 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, return b; } + // Degenerate case: 1x1 matrices. + if (ShapeUtil::GetDimension(a_shape, -1) == 1) { + return unit_diagonal ? b : Div(b, MaybeConjugate(a, conjugate_a)); + } + // TODO(phawkins): consider pushing triangle masking into // InvertDiagonalBlocks. if (unit_diagonal) { @@ -440,8 +454,7 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, auto diag_blocks = DiagonalBlocks(a, block_size); // We invert these blocks in parallel using batched matrix-vector products - auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a, - conjugate_a, precision); + auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, precision); // We now find the solution using GEMMs auto x = @@ -452,8 +465,6 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, }); } -} // namespace - TriangularSolveExpander::TriangularSolveExpander(int64 block_size) : block_size_(block_size) {} diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h index 362e8557229..3f9e58a3246 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.h +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h @@ -17,6 +17,7 @@ limitations under the License. 
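The comments above describe the blocked algorithm in terms of unit-triangular blocks. Below is a minimal standalone sketch (written for illustration here, not taken from the XLA sources) of the same row-by-row idea for a single unit lower-triangular matrix: once rows 0..i-1 of the inverse are known, row i is obtained by negating the product of row i of the input with the partially built inverse, which is what the While loop above does per block with DynamicUpdateSlice.

#include <cstdio>

// Illustrative sketch only: invert a unit lower-triangular n x n matrix by
// forward substitution, one row at a time.  Row i of the inverse depends only
// on rows 0..i-1, which have already been computed.
int main() {
  const int n = 3;
  // L is unit lower triangular (1s on the diagonal).
  double L[n][n] = {{1, 0, 0}, {2, 1, 0}, {3, 4, 1}};
  double inv[n][n] = {};  // Will hold L^{-1}, also unit lower triangular.
  for (int i = 0; i < n; ++i) {
    inv[i][i] = 1.0;
    for (int j = 0; j < i; ++j) {
      // inv[i][j] = -sum_{k=j..i-1} L[i][k] * inv[k][j], using L[i][i] == 1.
      double acc = 0.0;
      for (int k = j; k < i; ++k) acc += L[i][k] * inv[k][j];
      inv[i][j] = -acc;
    }
  }
  for (int i = 0; i < n; ++i) {
    printf("%6.2f %6.2f %6.2f\n", inv[i][0], inv[i][1], inv[i][2]);
  }
  return 0;
}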
#define TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_ #include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { @@ -35,6 +36,14 @@ class TriangularSolveExpander : public OpExpanderPass { StatusOr ExpandInstruction( HloInstruction* instruction) override; + virtual XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower_triangular, + PrecisionConfig::Precision precision); + + XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, + bool transpose_a, bool conjugate_a, + bool unit_diagonal, int64 block_size, + PrecisionConfig::Precision precision); + private: // Block size for BuildTriangularSolve const int64 block_size_; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index c66f9d96a50..e2b977ad493 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -333,10 +333,10 @@ TEST_F(TuplePointsToAnalysisTest, CopyStartAndCopyDone) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto copy_start = builder.AddInstruction(HloInstruction::CreateUnary( + auto copy_start = builder.AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({constant->shape(), constant->shape(), ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, constant)); + constant)); auto copy_done = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kCopyDone, copy_start)); diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index c80123bcd50..785fdecbfa0 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -37,23 +37,15 @@ namespace m = match; using absl::optional; using hlo_query::ContainsInstrWithOpcode; -// Tries to remove elements in a while loop's tuple that aren't used within the -// loop. -// -// Specifically, if a loop is tuple-shaped, and there exists some element of -// that tuple that is not used by the loop condition and is not used by the loop -// body except to pass it to the next iteration of the loop, then we can remove -// that element from the loop's tuples. -static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - - // Don't try this transformation if the while loop isn't removable, since if - // it succeeds ultimately we're going to have to replace the old while loop - // with a new one. - if (!while_op->parent()->IsSafelyRemovable(while_op)) { - VLOG(2) << "Can't remove dead parameters from non-removable while op."; - return false; - } +// This is a utility function that removes the given tuple indices from the +// while loop init, body, and condition. The final shape returned is still the +// same as before. +static StatusOr RemoveDeadTupleIndices( + HloInstruction* while_op, absl::flat_hash_set& used_tuple_indices) { + // Build up maps from the old/new to the new/old tuple indices. 
+ std::vector new_to_old_tuple_idx(used_tuple_indices.begin(), + used_tuple_indices.end()); + absl::c_sort(new_to_old_tuple_idx); HloModule* module = while_op->GetModule(); HloComputation* computation = while_op->parent(); @@ -62,107 +54,8 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { HloComputation* while_body = while_op->while_body(); HloInstruction* while_body_root = while_body->root_instruction(); - if (!while_init->shape().IsTuple()) { - VLOG(2) << "While op's carried value isn't tuple shaped."; - return false; - } - - if (while_body_root->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While body's root is not a tuple(...) instruction."; - return false; - } - auto print_no_metadata = HloPrintOptions().set_print_metadata(false); - // Bail if param0 of while_cond or while_body has users which aren't of type - // get-tuple-element. - for (const HloInstruction* instr : {while_body->parameter_instruction(0), - while_cond->parameter_instruction(0)}) { - for (const HloInstruction* user : instr->users()) { - if (user->opcode() != HloOpcode::kGetTupleElement) { - VLOG(2) << "Cowardly refusing to analyze while loop with " - << instr->ToString(print_no_metadata) - << " used by non-GTE instruction " - << user->ToString(print_no_metadata) << " in computation " - << instr->parent()->name(); - return false; - } - } - } - - const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); - if (tuple_size == 0) { - VLOG(2) << "Can't remove elements from while loop's tuple -- it's already " - "empty."; - return false; - } - - absl::flat_hash_set used_tuple_indices; - for (HloComputation* comp : {while_body, while_cond}) { - // The HLO verifier ensures that while_input's shape matches while_init's - // shape, which we verified above is a tuple. - HloInstruction* while_input = comp->parameter_instruction(0); - - for (const HloInstruction* user : while_input->users()) { - // This user doesn't count if it's only used by the while body's root, and - // the root places the tuple element into the same index of the tuple as - // it came from. That just amounts to us carrying the variable through - // the loop. - // - // Careful: HloInstruction::operand_index returns the first index the - // operand appears in, but it may appear more than once! - if (user->user_count() == 1 && user->users().front() == while_body_root && - while_body_root->operand_index(user) == user->tuple_index() && - absl::c_count(while_body_root->operands(), user) == 1) { - continue; - } - - used_tuple_indices.insert(user->tuple_index()); - if (used_tuple_indices.size() == tuple_size) { - VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) - << " uses all of its inputs; no simplification possible."; - return false; - } - } - } - - // If a tuple element is not passed unmodified from the while body's param0 - // through to the while body's root, count that element as "used", since - // removing that element would be observable. 
- for (int64 i = 0; i < while_body_root->operand_count(); ++i) { - if (used_tuple_indices.contains(i)) { - continue; - } - - auto* operand = while_body_root->operand(i); - if (operand->opcode() != HloOpcode::kGetTupleElement || - operand->operand(0) != while_body->parameter_instruction(0) || - operand->tuple_index() != i) { - VLOG(2) << "Tuple index " << i - << " is not passed through loop body unmodified."; - used_tuple_indices.insert(i); - - if (used_tuple_indices.size() == tuple_size) { - VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) - << " uses all of its inputs; no simplification possible."; - return false; - } - } - } - - // If we got here, used_tuple_indices.size() < tuple_size, meaning some - // elements of the loop's tuple aren't used by while_body or while_cond. - CHECK_LT(used_tuple_indices.size(), tuple_size); - - VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size() - << " elements from tuple of " - << while_op->ToString(print_no_metadata); - - // Build up maps from the old/new to the new/old tuple indices. - std::vector new_to_old_tuple_idx(used_tuple_indices.begin(), - used_tuple_indices.end()); - absl::c_sort(new_to_old_tuple_idx); - absl::flat_hash_map old_to_new_tuple_idx; for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) { int64 old_idx = new_to_old_tuple_idx[new_idx]; @@ -288,6 +181,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { // The tuple simplifier will then simplify this if possible, removing // new_tuple and while_init. std::vector new_tuple_elems; + const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); for (int64 old_idx = 0; old_idx < tuple_size; ++old_idx) { auto new_tuple_idx_it = old_to_new_tuple_idx.find(old_idx); if (new_tuple_idx_it != old_to_new_tuple_idx.end()) { @@ -305,9 +199,293 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { HloInstruction* new_tuple = computation->AddInstruction(HloInstruction::CreateTuple(new_tuple_elems)); TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, new_tuple)); + + return new_while_op; +} + +// Tries to remove elements in a while loop's tuple that aren't used within the +// loop. +// +// Specifically, if a loop is tuple-shaped, and there exists some element of +// that tuple that is not used by the loop condition and is not used by the loop +// body except to pass it to the next iteration of the loop, then we can remove +// that element from the loop's tuples. +static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + + // Don't try this transformation if the while loop isn't removable, since if + // it succeeds ultimately we're going to have to replace the old while loop + // with a new one. + if (!while_op->parent()->IsSafelyRemovable(while_op)) { + VLOG(2) << "Can't remove dead parameters from non-removable while op."; + return false; + } + + HloInstruction* while_init = while_op->mutable_operand(0); + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_body_root = while_body->root_instruction(); + + if (!while_init->shape().IsTuple()) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple(...) 
instruction."; + return false; + } + + auto print_no_metadata = HloPrintOptions().set_print_metadata(false); + + // Bail if param0 of while_cond or while_body has users which aren't of type + // get-tuple-element. + for (const HloInstruction* instr : {while_body->parameter_instruction(0), + while_cond->parameter_instruction(0)}) { + for (const HloInstruction* user : instr->users()) { + if (user->opcode() != HloOpcode::kGetTupleElement) { + VLOG(2) << "Cowardly refusing to analyze while loop with " + << instr->ToString(print_no_metadata) + << " used by non-GTE instruction " + << user->ToString(print_no_metadata) << " in computation " + << instr->parent()->name(); + return false; + } + } + } + + const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); + if (tuple_size == 0) { + VLOG(2) << "Can't remove elements from while loop's tuple -- it's already " + "empty."; + return false; + } + + absl::flat_hash_set used_tuple_indices; + for (HloComputation* comp : {while_body, while_cond}) { + // The HLO verifier ensures that while_input's shape matches while_init's + // shape, which we verified above is a tuple. + HloInstruction* while_input = comp->parameter_instruction(0); + + for (const HloInstruction* user : while_input->users()) { + // This user doesn't count if it's only used by the while body's root, and + // the root places the tuple element into the same index of the tuple as + // it came from. That just amounts to us carrying the variable through + // the loop. + // + // Careful: HloInstruction::operand_index returns the first index the + // operand appears in, but it may appear more than once! + if (user->user_count() == 1 && user->users().front() == while_body_root && + while_body_root->operand_index(user) == user->tuple_index() && + absl::c_count(while_body_root->operands(), user) == 1) { + continue; + } + + used_tuple_indices.insert(user->tuple_index()); + if (used_tuple_indices.size() == tuple_size) { + VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) + << " uses all of its inputs; no simplification possible."; + return false; + } + } + } + + // If a tuple element is not passed unmodified from the while body's param0 + // through to the while body's root, count that element as "used", since + // removing that element would be observable. + for (int64 i = 0; i < while_body_root->operand_count(); ++i) { + if (used_tuple_indices.contains(i)) { + continue; + } + + auto* operand = while_body_root->operand(i); + if (operand->opcode() != HloOpcode::kGetTupleElement || + operand->operand(0) != while_body->parameter_instruction(0) || + operand->tuple_index() != i) { + VLOG(2) << "Tuple index " << i + << " is not passed through loop body unmodified."; + used_tuple_indices.insert(i); + + if (used_tuple_indices.size() == tuple_size) { + VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) + << " uses all of its inputs; no simplification possible."; + return false; + } + } + } + + // If we got here, used_tuple_indices.size() < tuple_size, meaning some + // elements of the loop's tuple aren't used by while_body or while_cond. + CHECK_LT(used_tuple_indices.size(), tuple_size); + + VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size() + << " elements from tuple of " + << while_op->ToString(print_no_metadata); + + TF_ASSIGN_OR_RETURN(while_op, + RemoveDeadTupleIndices(while_op, used_tuple_indices)); + return true; } +// This is a helper function for TryRemoveRepeatedWhileTupleIndices. 
It removes +// duplicates by replacing them with tuple_index, followed by a call to +// RemoveDeadTupleIndices. +static StatusOr TryRemoveRepeatedWhileTupleIndicesHelper( + HloInstruction* while_op, const int64 tuple_index, + absl::flat_hash_set& duplicates) { + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_init = while_op->mutable_operand(0); + + VLOG(2) << "while_init " << while_init->ToString() << " operands " + << while_init->operand_count(); + VLOG(2) << "while_body_root " << while_body->root_instruction()->ToString() + << " operands " << while_body->root_instruction()->operand_count(); + + // Change the loop body and condition such that uses of the duplicates are + // replaced with the original tuple element. + for (HloComputation* comp : {while_body, while_cond}) { + auto new_get = comp->AddInstruction(HloInstruction::CreateGetTupleElement( + comp->parameter_instruction(0)->shape().tuple_shapes(tuple_index), + comp->parameter_instruction(0), tuple_index)); + + std::vector instrs_to_replace; + for (auto* instr : comp->instructions()) { + if (instr->opcode() == HloOpcode::kGetTupleElement && + duplicates.contains(instr->tuple_index()) && + instr->operand(0) == comp->parameter_instruction(0)) { + instrs_to_replace.push_back(instr); + } + } + + for (auto instr : instrs_to_replace) { + TF_RETURN_IF_ERROR(comp->ReplaceInstruction(instr, new_get)); + } + } + + // We know which tuple indices are useful; i.e, those which aren't duplicates. + absl::flat_hash_set used_tuple_indices; + for (int index = 0; index < while_init->shape().tuple_shapes_size(); + ++index) { + if (!duplicates.count(index)) { + used_tuple_indices.insert(index); + } + } + + // Remove the duplicate tuple elements. + TF_ASSIGN_OR_RETURN(while_op, + RemoveDeadTupleIndices(while_op, used_tuple_indices)); + + return while_op; +} + +// If the while loop init passes the same values to several tuple indices, and +// if the body keeps on passing them through, we can remove the duplicates. +static StatusOr TryRemoveRepeatedWhileTupleIndices( + HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + + int index_to_investigate = 0; + // Don't try this transformation if the while loop isn't removable, since if + // it succeeds ultimately we're going to have to replace the old while loop + // with a new one. + if (!while_op->parent()->IsSafelyRemovable(while_op)) { + VLOG(2) << "Can't remove dead parameters from non-removable while op."; + return false; + } + + HloInstruction* while_init = while_op->mutable_operand(0); + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_body_root = while_body->root_instruction(); + + if (!while_init->shape().IsTuple()) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + bool changed = false; + while (index_to_investigate < while_init->shape().tuple_shapes_size()) { + if (!while_init->shape().IsTuple() || + while_init->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple(...) 
instruction."; + return false; + } + + auto& while_shape = while_init->shape(); + VLOG(2) << "Iterating " << index_to_investigate; + + absl::flat_hash_set duplicates; + auto* pivot_init_elem = while_init->operand(index_to_investigate); + auto* pivot_body_elem = while_body_root->operand(index_to_investigate); + if (pivot_body_elem->opcode() == HloOpcode::kGetTupleElement && + pivot_body_elem->operand(0) == while_body->parameter_instruction(0)) { + if (pivot_body_elem->tuple_index() != index_to_investigate) { + VLOG(2) << "Mismatch between pivot_body_elem->tuple_index() " + << pivot_body_elem->tuple_index() << " index_to_investigate " + << index_to_investigate; + index_to_investigate++; + continue; + } + } else { + index_to_investigate++; + continue; + } + + // Look from index_to_investigate onwards to see if it is repeated. + for (int64 i = index_to_investigate + 1; + i < while_shape.tuple_shapes_size(); ++i) { + auto* init_elem = while_init->operand(i); + auto* body_elem = while_body_root->operand(i); + if (body_elem->opcode() == HloOpcode::kGetTupleElement && + body_elem->operand(0) == while_body->parameter_instruction(0)) { + if (body_elem->tuple_index() != i) { + VLOG(2) << "Mismatch between body_elem->tuple_index() " + << body_elem->tuple_index() << " i " << i; + continue; + } + } else { + continue; + } + + if (pivot_init_elem == init_elem) { + VLOG(2) << "init_elem " << init_elem->ToString() << " pivot_init_elem " + << pivot_init_elem->ToString(); + VLOG(2) << "body_elem " << body_elem->ToString() << " pivot_body_elem " + << pivot_body_elem->ToString(); + duplicates.insert(i); + } + } + + // If duplicates are found, call the helper to remove them. + if (!duplicates.empty()) { + VLOG(2) << "Duplicate found " << duplicates.size() << " pivot_init " + << pivot_init_elem->ToString(); + TF_ASSIGN_OR_RETURN(while_op, + TryRemoveRepeatedWhileTupleIndicesHelper( + while_op, index_to_investigate, duplicates)); + changed = true; + VLOG(2) << "Changed while_op " << while_op->ToString() + << " while_op operand count " << while_op->operand_count(); + // Update the while loop variables so we can continue looking for + // duplicates of a different index. + while_init = while_op->mutable_operand(0); + while_cond = while_op->while_condition(); + while_body = while_op->while_body(); + while_body_root = while_body->root_instruction(); + } + index_to_investigate++; + } + + return changed; +} + // Removes each loop parameter (i.e. member of the while loop tuple) that is a // constant and is the same in the while loop body and the while loop init. static StatusOr TryRemoveConstantParams(HloInstruction* while_op) { @@ -1048,6 +1226,7 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(result, TryRemoveWhileLoop(while_op)); changed |= result; + if (result) { // Don't continue simplifying after successfully removing the while loop // -- that would result in use-after-free nastiness. @@ -1067,6 +1246,12 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { // successful, meaning that `while_op` is no longer valid after one of these // transformations returns true. 
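For readers skimming the pass above, here is a small self-contained sketch (illustrative only, not code from this patch) of the index bookkeeping that RemoveDeadTupleIndices relies on: the surviving tuple indices are sorted into a new_to_old vector, its inverse old_to_new map is used to rewrite get-tuple-element indices, and any old index missing from the map is dropped when the loop tuple is rebuilt.

#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  const int tuple_size = 5;
  // Suppose the analysis decided only indices {0, 2, 4} of the loop-carried
  // tuple are really used (illustrative numbers, not from the patch).
  std::set<int> used_tuple_indices = {0, 2, 4};

  // new_to_old: position in the shrunken tuple -> position in the old tuple.
  std::vector<int> new_to_old(used_tuple_indices.begin(),
                              used_tuple_indices.end());
  std::sort(new_to_old.begin(), new_to_old.end());

  // old_to_new: inverse map, used to rewrite get-tuple-element indices.
  std::map<int, int> old_to_new;
  for (int new_idx = 0; new_idx < static_cast<int>(new_to_old.size());
       ++new_idx) {
    old_to_new[new_to_old[new_idx]] = new_idx;
  }

  for (int old_idx = 0; old_idx < tuple_size; ++old_idx) {
    auto it = old_to_new.find(old_idx);
    if (it != old_to_new.end()) {
      printf("old tuple index %d -> new tuple index %d\n", old_idx, it->second);
    } else {
      printf("old tuple index %d is dropped\n", old_idx);
    }
  }
  return 0;
}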
+ TF_ASSIGN_OR_RETURN(result, TryRemoveRepeatedWhileTupleIndices(while_op)); + changed |= result; + if (result) { + continue; + } + TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op)); changed |= result; if (result) { @@ -1074,6 +1259,7 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { } TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op)); + changed |= result; if (result) { continue; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index d715fb3857a..c93cb5dc347 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -794,5 +794,51 @@ TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) { .ValueOrDie()); } +TEST_F(WhileLoopSimplifierTest, RemoveRepeatedParams) { + const string hlo_string = R"( + HloModule SwappingTupleElements + + SwappingTupleElements.body { + loop_var = (s32[], s32[], s32[]) parameter(0) + get-tuple-element = s32[] get-tuple-element(loop_var), index=0 + get-tuple-element.1 = s32[] get-tuple-element(loop_var), index=1 + get-tuple-element.2 = s32[] get-tuple-element(loop_var), index=2 + y = s32[] add(get-tuple-element.1, get-tuple-element.2) + ROOT tuple = (s32[], s32[], s32[]) tuple(s32[] get-tuple-element, y, + s32[] get-tuple-element.2) + } + + SwappingTupleElements.always_true { + param = (s32[], s32[], s32[]) parameter(0) + get-tuple-element = s32[] get-tuple-element(param), index=0 + get-tuple-element.1 = s32[] get-tuple-element(param), index=1 + ROOT less-than = pred[] compare(get-tuple-element, get-tuple-element.1), direction=LT + } + + ENTRY SwappingTupleElements { + x = s32[] parameter(0) + y = s32[] parameter(1) + tuple.1 = (s32[], s32[], s32[]) tuple(s32[] x, s32[] y, s32[] x) + ROOT while = (s32[], s32[], s32[]) while(tuple.1), + condition=SwappingTupleElements.always_true, + body=SwappingTupleElements.body + } + )"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + HloInstruction* new_while = FindFirstWhile(m.get()); + Shape new_while_shape = ParseShape("(s32[], s32[])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h index b4982f1d8e4..64c9635f335 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -61,6 +61,10 @@ class ShapeLayout { // Returns the shape (with layouts). const Shape& shape() const { return shape_; } + // Clear dynamic dimensions of this module. Pretending the module creates + // static results. Useful in inspecting full outputs when testing. + void ClearDynamicShape() { shape_.clear_dynamic_dimensions(); } + // Checks that a layout is set for the shape, and returns a reference to the // layout directly on the shape. Shape must not be a tuple. 
const Layout& layout() const; diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index 73bb3327784..b1c96e9becf 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -70,6 +70,8 @@ struct IndexTableEntry { template class ShapeTreeIterator; +template +class ShapeTreeLeafIterator; // A ShapeTree is a recursive data structure which mirrors the structure of a // XLA shape and holds a value of type T for each subshape (i.e. tuple or array) @@ -158,23 +160,25 @@ class ShapeTree { using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; + using leaf_iterator = + ShapeTreeLeafIterator, + typename std::vector::iterator, + std::pair>; + using const_leaf_iterator = + ShapeTreeLeafIterator, + typename std::vector::const_iterator, + const std::pair>; + using reverse_leaf_iterator = std::reverse_iterator; + using const_reverse_leaf_iterator = + std::reverse_iterator; + // begin/end for iterating over all nodes. - iterator begin() { - return iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/false); - } - iterator end() { - return iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/false); - } + iterator begin() { return iterator(&nodes_, nodes_.begin()); } + iterator end() { return iterator(&nodes_, nodes_.end()); } const_iterator begin() const { - return const_iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/false); - } - const_iterator end() const { - return const_iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/false); + return const_iterator(&nodes_, nodes_.begin()); } + const_iterator end() const { return const_iterator(&nodes_, nodes_.end()); } // rbegin/rend for iterating over all nodes in reverse. reverse_iterator rbegin() { return reverse_iterator(end()); } @@ -188,37 +192,33 @@ class ShapeTree { // leaf_begin()/leaf_end() iterates over all leaf nodes (nodes with no // children). - iterator leaf_begin() { - return iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/true); + leaf_iterator leaf_begin() { return leaf_iterator(&nodes_, nodes_.begin()); } + leaf_iterator leaf_end() { return leaf_iterator(&nodes_, nodes_.end()); } + const_leaf_iterator leaf_begin() const { + return const_leaf_iterator(&nodes_, nodes_.begin()); } - iterator leaf_end() { - return iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/true); - } - const_iterator leaf_begin() const { - return const_iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/true); - } - const_iterator leaf_end() const { - return const_iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/true); + const_leaf_iterator leaf_end() const { + return const_leaf_iterator(&nodes_, nodes_.end()); } // range-based iterator for leaf_begin()/leaf_end(). 
- tensorflow::gtl::iterator_range leaves() { + tensorflow::gtl::iterator_range leaves() { return tensorflow::gtl::make_range(leaf_begin(), leaf_end()); } - tensorflow::gtl::iterator_range leaves() const { + tensorflow::gtl::iterator_range leaves() const { return tensorflow::gtl::make_range(leaf_begin(), leaf_end()); } - reverse_iterator leaf_rbegin() { return reverse_iterator(leaf_end()); } - reverse_iterator leaf_rend() { return reverse_iterator(leaf_begin()); } - const_reverse_iterator leaf_rbegin() const { - return const_reverse_iterator(leaf_end()); + reverse_leaf_iterator leaf_rbegin() { + return reverse_leaf_iterator(leaf_end()); } - const_reverse_iterator leaf_rend() const { - return const_reverse_iterator(leaf_begin()); + reverse_leaf_iterator leaf_rend() { + return reverse_leaf_iterator(leaf_begin()); + } + const_reverse_leaf_iterator leaf_rbegin() const { + return const_reverse_leaf_iterator(leaf_end()); + } + const_reverse_leaf_iterator leaf_rend() const { + return const_reverse_leaf_iterator(leaf_begin()); } // Returns an iterator pointing to the given ShapeIndex. @@ -226,12 +226,12 @@ class ShapeTree { iterator find(ShapeIndexView index) { Node* element = Lookup(index); auto element_iter = nodes_.begin() + (element - &nodes_[0]); - return iterator(&nodes_, element_iter, /*iterate_leaves_only=*/false); + return iterator(&nodes_, element_iter); } const_iterator find(ShapeIndexView index) const { - Node* element = Lookup(index); + const Node* element = Lookup(index); auto element_iter = nodes_.cbegin() + (element - &nodes_[0]); - return const_iterator(&nodes_, element_iter, /*iterate_leaves_only=*/false); + return const_iterator(&nodes_, element_iter); } // Returns the number of leaf nodes in the tree. @@ -343,21 +343,11 @@ template class ShapeTreeIterator : public std::iterator { public: - ShapeTreeIterator(ContainerType* nodes, IteratorType node, - bool iterate_leaves_only) - : nodes_(nodes), - node_(std::move(node)), - iterate_leaves_only_(iterate_leaves_only) { - while (iterate_leaves_only && node_ != nodes_->end() && !node_->is_leaf) { - ++node_; - } - } + ShapeTreeIterator(ContainerType* nodes, IteratorType node) + : nodes_(nodes), node_(std::move(node)) {} ShapeTreeIterator& operator++() { ++node_; - while (iterate_leaves_only_ && node_ != nodes_->end() && !node_->is_leaf) { - ++node_; - } return *this; } ShapeTreeIterator operator++(int) { @@ -368,9 +358,6 @@ class ShapeTreeIterator ShapeTreeIterator& operator--() { --node_; - while (iterate_leaves_only_ && node_ > nodes_->begin() && !node_->is_leaf) { - --node_; - } return *this; } ShapeTreeIterator operator--(int) { @@ -385,14 +372,66 @@ class ShapeTreeIterator bool operator!=(const ShapeTreeIterator& other) const { return node_ != other.node_; } - ValueType& operator*() { return node_->data; } - ValueType* operator->() { return &node_->data; } + ValueType& operator*() const { return node_->data; } + ValueType* operator->() const { return &node_->data; } + + private: + ContainerType* nodes_; + IteratorType node_; +}; + +// Internal iterator that performs a pre-order walk of the leaves. This is cheap +// to copy. The iterator value_type is equivalent to a std::pair&, +// similar to std::map. 
+template +class ShapeTreeLeafIterator + : public std::iterator { + public: + ShapeTreeLeafIterator(ContainerType* nodes, IteratorType node) + : nodes_(nodes), node_(std::move(node)) { + while (node_ != nodes_->end() && !node_->is_leaf) { + ++node_; + } + } + + ShapeTreeLeafIterator& operator++() { + ++node_; + while (node_ != nodes_->end() && !node_->is_leaf) { + ++node_; + } + return *this; + } + ShapeTreeLeafIterator operator++(int) { + auto i = *this; + ++(*this); + return i; + } + + ShapeTreeLeafIterator& operator--() { + --node_; + while (node_ > nodes_->begin() && !node_->is_leaf) { + --node_; + } + return *this; + } + ShapeTreeLeafIterator operator--(int) { + auto i = *this; + --(*this); + return i; + } + + bool operator==(const ShapeTreeLeafIterator& other) const { + return node_ == other.node_; + } + bool operator!=(const ShapeTreeLeafIterator& other) const { + return node_ != other.node_; + } + ValueType& operator*() const { return node_->data; } + ValueType* operator->() const { return &node_->data; } private: ContainerType* nodes_; IteratorType node_; - // True if we should not include interior nodes in our walk. - const bool iterate_leaves_only_; }; template @@ -648,7 +687,9 @@ void ShapeTree::CopySubtreeFrom(const ShapeTree& other, const ShapeIndex& target_base_index) { CHECK(ShapeUtil::Compatible( ShapeUtil::GetSubshape(shape(), target_base_index), - ShapeUtil::GetSubshape(other.shape(), source_base_index))); + ShapeUtil::GetSubshape(other.shape(), source_base_index))) + << ShapeUtil::GetSubshape(shape(), target_base_index) << " vs " + << ShapeUtil::GetSubshape(other.shape(), source_base_index); ForEachMutableElement([this, &other, &source_base_index, &target_base_index]( const ShapeIndex& index, T* data) { // Copy the data element only if index is in the diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index 2b6c484bc4f..c294355e269 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -485,6 +485,30 @@ TEST_F(ShapeTreeTest, ReverseIterateOrder) { })); } +// Ensures that we can find an element at an index that we know ahead of time to +// be occupied in a 'ShapeTree' via the 'find' API. +TEST_F(ShapeTreeTest, Find) { + ShapeTree t(nested_tuple_shape_, 42); + auto found = t.find({1, 0}); + EXPECT_NE(found, t.end()); + // The found key must be the same key we searched for. + EXPECT_EQ(found->first, ShapeIndex({1, 0})); + // The 'ShapeTree' has 42 at every position. + EXPECT_EQ(found->second, 42); +} + +// Ensures that we can find an element at an index that we know ahead of time to +// be occupied in a 'const ShapeTree' via the 'find' API. +TEST_F(ShapeTreeTest, ConstFind) { + const ShapeTree t(nested_tuple_shape_, 42); + auto found = t.find({1, 0}); + EXPECT_NE(found, t.end()); + // The found key must be the same key we searched for. + EXPECT_EQ(found->first, ShapeIndex({1, 0})); + // The 'ShapeTree' has 42 at every position. 
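The new ShapeTreeLeafIterator above folds the old iterate_leaves_only flag into a dedicated type. The pattern it uses is sketched below in stripped-down form (toy Node type, not the XLA classes): both the constructor and operator++ skip entries whose is_leaf flag is false, so leaf_begin()/leaf_end() can simply wrap the underlying container iterators.

#include <cstdio>
#include <vector>

// Toy node type for illustration only.
struct Node {
  bool is_leaf;
  int data;
};

// Minimal "skip non-leaves on construction and on increment" iterator.
class LeafIterator {
 public:
  LeafIterator(const std::vector<Node>* nodes,
               std::vector<Node>::const_iterator it)
      : nodes_(nodes), it_(it) {
    SkipNonLeaves();
  }
  LeafIterator& operator++() {
    ++it_;
    SkipNonLeaves();
    return *this;
  }
  bool operator!=(const LeafIterator& other) const { return it_ != other.it_; }
  const Node& operator*() const { return *it_; }

 private:
  void SkipNonLeaves() {
    while (it_ != nodes_->end() && !it_->is_leaf) ++it_;
  }
  const std::vector<Node>* nodes_;
  std::vector<Node>::const_iterator it_;
};

int main() {
  std::vector<Node> nodes = {{false, 0}, {true, 1}, {false, 2}, {true, 3}};
  for (LeafIterator it(&nodes, nodes.begin()), end(&nodes, nodes.end());
       it != end; ++it) {
    printf("leaf data = %d\n", (*it).data);  // Prints 1 and 3.
  }
  return 0;
}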
+ EXPECT_EQ(found->second, 42); +} + TEST_F(ShapeTreeTest, IterateOrderLeaves) { ShapeTree t(nested_tuple_shape_, 42); std::vector v; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 02fcaafd19d..0833919b124 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -783,9 +783,18 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( /* static */ Shape ShapeUtil::ChangeElementType(const Shape& original, PrimitiveType type) { - Shape new_shape = original; - new_shape.set_element_type(type); - return new_shape; + if (original.IsTuple()) { + std::vector new_operands; + new_operands.reserve(original.tuple_shapes_size()); + for (const Shape& operand : original.tuple_shapes()) { + new_operands.push_back(ChangeElementType(operand, type)); + } + return MakeTupleShape(new_operands); + } else { + Shape new_shape = original; + new_shape.set_element_type(type); + return new_shape; + } } /* static */ bool ShapeUtil::IndexIsValid(const Shape& shape, diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 927f9d14883..d9110ed1f35 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -380,12 +380,7 @@ xla_test( name = "conv_depthwise_backprop_filter_test", timeout = "long", srcs = ["conv_depthwise_backprop_filter_test.cc"], - # these backends do not natively handle batch group counts. - disabled_backends = [ - "gpu", - "cpu", - ], - shard_count = 6, + shard_count = 40, deps = [ ":test_macros_header", "//tensorflow/compiler/xla:execution_options_util", @@ -2088,6 +2083,31 @@ xla_test( ], ) +xla_test( + name = "dynamism_inference_test", + srcs = ["dynamism_inference_test.cc"], + deps = [ + ":test_macros_header", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:prng", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/strings", + ], +) + xla_test( name = "compute_constant_test", srcs = ["compute_constant_test.cc"], @@ -2674,5 +2694,6 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "//tensorflow/core/platform:tensor_float_32_utils", ], ) diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index a956b85a940..ef4ce24a839 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1203,6 +1203,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) { ComputeAndCompareR1(&builder, {false, false, true, false, false}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32sTO) { + SetFastMathDisabled(true); + XlaBuilder builder(TestName()); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, 
{10.0f, 5.0f, 2.25f, NAN, NAN}); + EqTotalOrder(lhs, rhs); + + ComputeAndCompareR1(&builder, {false, false, true, true, false}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) { XlaBuilder builder(TestName()); auto lhs = ConstantR1(&builder, {}); @@ -1222,6 +1232,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) { ComputeAndCompareR1(&builder, {false, true, true, false, false}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32sTO) { + SetFastMathDisabled(true); + XlaBuilder builder(TestName()); + // For portability, need to represent NAN using the following call. + // The C++ standard does not specify if quiet_NaN() sets the sign bit of + // its result. The call to std::fabs will ensure that it is not set. + auto nan = std::fabs(std::numeric_limits::quiet_NaN()); + auto lhs = + ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, nan, 6.0f, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, nan, -nan}); + GeTotalOrder(lhs, rhs); + + ComputeAndCompareR1(&builder, {false, true, true, true, false, true}, + {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc index 856ea7c9b44..f78083fe2af 100644 --- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc +++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc @@ -61,7 +61,7 @@ class BufferDonationTest : public HloTestBase { absl::Span argument_literals, absl::Span donate_arguments, absl::Span expected_runtime_aliasing, - const Literal& expected) { + const Literal& expected, std::string expected_failure = "") { // Create a copy of the output shape because the HLO module is std::moved // into the compiler and may be deallocated. 
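The EqTotalOrder/GeTotalOrder expectations above assume an IEEE-754-style total ordering in which -NaN sorts below everything and +NaN above everything, so NaN == NaN and NaN >= 10.0 hold. One common way to obtain such an ordering (a sketch of the general technique, not the XLA implementation) is to map each float to a monotone signed-integer key and compare the keys:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

// Illustrative only: map a float to a signed key such that
// -NaN < -Inf < ... < -0 < +0 < ... < +Inf < +NaN under ordinary integer
// comparison.  Not taken from the XLA sources.
int32_t TotalOrderKey(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  if ((u & 0x80000000u) == 0) {
    return static_cast<int32_t>(u);  // +0, positive finites, +Inf, +NaN.
  }
  // Negative sign bit: larger magnitude must map to a smaller key.
  return -1 - static_cast<int32_t>(u & 0x7FFFFFFFu);
}

int main() {
  float nan = std::fabs(std::numeric_limits<float>::quiet_NaN());
  printf("NaN == NaN  (total order): %d\n",
         TotalOrderKey(nan) == TotalOrderKey(nan));    // 1
  printf("NaN >= 10.0 (total order): %d\n",
         TotalOrderKey(nan) >= TotalOrderKey(10.0f));  // 1
  printf("6.0 >= -NaN (total order): %d\n",
         TotalOrderKey(6.0f) >= TotalOrderKey(-nan));  // 1
  return 0;
}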
const Shape output_shape = hlo_module->result_shape(); @@ -123,10 +123,19 @@ class BufferDonationTest : public HloTestBase { ExecutionInput(std::move(owned_buffers), argument_literal.shape())); } - TF_ASSERT_OK_AND_ASSIGN( - ExecutionOutput output, + StatusOr output_status = executable->ExecuteAsyncOnStream(&service_run_options, std::move(args), - /*hlo_execution_profile=*/nullptr)); + /*hlo_execution_profile=*/nullptr); + if (!expected_failure.empty()) { + ASSERT_FALSE(output_status.ok()); + ASSERT_TRUE(absl::StrContains(output_status.status().error_message(), + expected_failure)) + << "got: \n" + << output_status.status().error_message() << " \nvs want\n" + << expected_failure; + return; + } + ExecutionOutput output = output_status.ConsumeValueOrDie(); se::DeviceMemoryBase result_root_buffer = output.Result().root_buffer(); LOG(INFO) << "result allocation = " << result_root_buffer.opaque() @@ -303,5 +312,37 @@ ENTRY entry { #endif } +TEST_F(BufferDonationTest, TestMustAliasNotDonated) { + HloModuleConfig config; + + StatusOr> module = + ParseAndReturnVerifiedModule(R"( +HloModule module + +ENTRY entry { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT out = (f32[], f32[]) tuple(a, b) +} + )", + config); + + TF_ASSERT_OK(module->get()->input_output_alias_config().SetUpAlias( + {0}, 0, {}, HloInputOutputAliasConfig::kMustAlias)); + + std::vector args; + args.push_back(LiteralUtil::CreateR0(0.1)); + args.push_back(LiteralUtil::CreateR0(0.2)); + Literal expected = LiteralUtil::MakeTupleFromSlices( + {LiteralUtil::CreateR0(0.1), LiteralUtil::CreateR0(0.2)}); + +#ifndef XLA_TEST_BACKEND_INTERPRETER + RunAndCheck(std::move(*module), args, + /*donate_arguments=*/{false, false}, {true, false}, expected, + "An input was configured to be must-alias at " + "compile time but not donated at runtime:"); +#endif +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/cholesky_test.cc b/tensorflow/compiler/xla/tests/cholesky_test.cc index e7f5ca5ed8e..616b404b425 100644 --- a/tensorflow/compiler/xla/tests/cholesky_test.cc +++ b/tensorflow/compiler/xla/tests/cholesky_test.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" namespace xla { namespace { @@ -181,6 +182,8 @@ class RandomCholeskyTest public ::testing::WithParamInterface {}; XLA_TEST_P(RandomCholeskyTest, Random) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); XlaBuilder builder(TestName()); auto test_params = GetParam(); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 0e99ede5d01..6acbb7a9cf0 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -605,7 +605,7 @@ XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal, : LiteralSlice(literal)); } -std::unique_ptr +StatusOr> ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number, const Literal& literal, const string& name, @@ -637,15 +637,14 @@ Literal ClientLibraryTestBase::MaybeConvertLiteralToBfloat16( return literal.Clone(); } -std::unique_ptr +StatusOr> ClientLibraryTestBase::CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, const DeviceHandle* device_handle, XlaBuilder* builder, XlaOp* data_handle) { Literal param_literal = MaybeConvertLiteralToBfloat16(literal); - std::unique_ptr data = - client_->TransferToServer(param_literal, device_handle) - .ConsumeValueOrDie(); + TF_ASSIGN_OR_RETURN(auto data, + client_->TransferToServer(param_literal, device_handle)); *data_handle = Parameter(builder, parameter_number, param_literal.shape(), name); return data; diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 17bb70bdb42..3c9e37b8fa4 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -270,14 +270,14 @@ class ClientLibraryTestBase : public ManifestCheckingTest { // server, then stores into "data_handle" the global handle for that // parameter. When the use_bfloat16 flag is set but the literal has F32 // elements, the literal will be converted to BF16 before being transferred. - std::unique_ptr CreateParameterAndTransferLiteral( + StatusOr> CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, XlaBuilder* builder, XlaOp* data_handle); // As above, but the caller can specify the device that the literal is // transferred to. If device_handle is nullptr, the literal will be // transferred to the default device. 
- std::unique_ptr CreateParameterAndTransferLiteral( + StatusOr> CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, const DeviceHandle* device_handle, XlaBuilder* builder, XlaOp* data_handle); diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc index ff7e7955876..4a7070a32f3 100644 --- a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc +++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc @@ -45,13 +45,20 @@ class BatchGroupedConvolution2DTest public ::testing::WithParamInterface< ::testing::tuple> {}; -static std::vector GetConv2DTestCases() { +class BatchGroupedConvolution2DDepthTest + : public HloTestBase, + public ::testing::WithParamInterface< + ::testing::tuple> {}; + +static std::vector GetConv2DTestCases( + bool use_depth_multiplier) { std::vector config_set; std::vector> config_options = { - {8, 5, 3, 2}, {4, 5, 5, 2}, {8, 7, 4, 128}, - {16, 20, 20, 256}, {256, 7, 5, 4}, {256, 6, 6, 4}, - {256, 8, 8, 512}, {64, 7, 7, 960}, {64, 14, 14, 576}}; + {129, 10, 3, 2}, {4, 3, 3, 258}, {8, 4, 2, 128}, + {8, 3, 2, 256}, {256, 7, 5, 4}, {128, 6, 6, 4}, + {32, 5, 2, 129}, {16, 4, 3, 2}, {16, 3, 2, 64}}; + int64 counter = 2; for (auto option : config_options) { int64 feature = option[3]; int64 activation_size = option[1]; @@ -65,10 +72,16 @@ static std::vector GetConv2DTestCases() { config.activation_dims = {batch, activation_size, activation_size, feature}; - config.kernel_dims = {batch, kernel_size, kernel_size, feature}; - + const int64 depthwise_multiplier = use_depth_multiplier ? counter++ : 1; + config.kernel_dims = {batch, kernel_size, kernel_size, + feature * depthwise_multiplier}; + // Don't let the counter grow too much, else the compute demand will grow. + if (counter == 4) { + counter = 2; + } int64 output_space_size = 3 + activation_size - kernel_size; - config.output_dims = {output_space_size, output_space_size, feature, 1}; + config.output_dims = {output_space_size, output_space_size, + feature * depthwise_multiplier, 1}; config.activation_and_kernel_layout = {0, 3, 1, 2}; config.output_layout = {2, 3, 0, 1}; @@ -123,11 +136,13 @@ string BatchGroupedConvolution2DTestDataToString( } string BuildHloTextBatchGroupedConvolution2D( - const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16) { + const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16, + bool scheduled = false) { const string data_type = GetFloatDataType(use_bfloat16); + const string scheduled_tag = scheduled ? 
",is_scheduled=true" : ""; return absl::StrFormat( R"( - HloModule TensorFlowDepthwiseConv, is_scheduled=true + HloModule TensorFlowDepthwiseConv %s ENTRY main { activation = %s[%s]{%s} parameter(0) @@ -137,7 +152,7 @@ string BuildHloTextBatchGroupedConvolution2D( batch_group_count=%d } )", - data_type, absl::StrJoin(spec.activation_dims, ","), + scheduled_tag, data_type, absl::StrJoin(spec.activation_dims, ","), absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type, absl::StrJoin(spec.kernel_dims, ","), absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type, @@ -161,23 +176,26 @@ XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) { } #endif - const string hlo_text = - BuildHloTextBatchGroupedConvolution2D(spec, use_bfloat16); + const string hlo_text = BuildHloTextBatchGroupedConvolution2D( + spec, use_bfloat16, /*scheduled=*/false); - EXPECT_TRUE(RunAndCompareNoHloPasses( - hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status { - BFloat16MixedPrecisionRemoval remover; - TF_RETURN_IF_ERROR(remover.Run(module).status()); - Despecializer despecializer; - return despecializer.Run(module).status(); - })); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01})); } INSTANTIATE_TEST_CASE_P( BatchGroupedConvolution2DTestWithRandomIndices, BatchGroupedConvolution2DTest, - ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()), - ::testing::Bool()), + ::testing::Combine( + ::testing::ValuesIn(GetConv2DTestCases(/*use_depth_multiplier=*/false)), + ::testing::Bool()), + BatchGroupedConvolution2DTestDataToString); + +INSTANTIATE_TEST_CASE_P( + BatchGroupedConvolution2DDepthMultiplierTestWithRandomIndices, + BatchGroupedConvolution2DTest, + ::testing::Combine( + ::testing::ValuesIn(GetConv2DTestCases(/*use_depth_multiplier=*/true)), + ::testing::Bool()), BatchGroupedConvolution2DTestDataToString); } // namespace diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 60ba27b2050..e06e2972f1c 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -69,12 +69,14 @@ XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) { XlaBuilder builder(TestName()); XlaOp param; - auto param_data = CreateParameterAndTransferLiteral( - 0, - LiteralUtil::MakeTupleFromSlices( - {LiteralUtil::CreateR2({{1, 2}, {3, 4}}), - LiteralUtil::CreateR2({{5, 6}, {7, 8}})}), - "arg0", &builder, ¶m); + TF_ASSERT_OK_AND_ASSIGN( + auto param_data, + CreateParameterAndTransferLiteral( + 0, + LiteralUtil::MakeTupleFromSlices( + {LiteralUtil::CreateR2({{1, 2}, {3, 4}}), + LiteralUtil::CreateR2({{5, 6}, {7, 8}})}), + "arg0", &builder, ¶m)); auto lhs = GetTupleElement(param, 0); auto rhs = GetTupleElement(param, 1); Dot(lhs, rhs); diff --git a/tensorflow/compiler/xla/tests/dynamism_inference_test.cc b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc new file mode 100644 index 00000000000..a7e032448e0 --- /dev/null +++ b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc @@ -0,0 +1,242 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "absl/strings/match.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +// An enumerator for the client types that we want to iterate over in +// the various tests. +enum class ClientType { kLocal, kCompileOnly }; +ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly}; + +class DynamismInferenceTest : public ::testing::Test { + public: + explicit DynamismInferenceTest(se::Platform* platform = nullptr) + : platform_(platform) {} + + string TestName() const { + return ::testing::UnitTest::GetInstance()->current_test_info()->name(); + } + + Client* ClientOrDie(se::Platform* platform, ClientType client_type) { + if (client_type == ClientType::kLocal) { + StatusOr result = + ClientLibrary::GetOrCreateLocalClient(platform); + TF_CHECK_OK(result.status()) + << "could not create LocalClient for testing"; + return result.ValueOrDie(); + } else if (client_type == ClientType::kCompileOnly) { + StatusOr result = + ClientLibrary::GetOrCreateCompileOnlyClient(platform); + TF_CHECK_OK(result.status()) + << "could not create CompileOnlyClient for testing"; + return result.ValueOrDie(); + } + LOG(FATAL) << "invalid client_type value"; + } + + StatusOr ComputeDynamismLiteral(Client* client, XlaOp operand, + XlaBuilder* builder, + Layout* output_layout = nullptr) { + TF_ASSIGN_OR_RETURN(auto subgraph, + builder->BuildDynamicInferenceGraph(operand)); + TF_ASSIGN_OR_RETURN(auto computed, + client->ComputeConstant(subgraph, output_layout)); + return std::move(computed); + } + + StatusOr ComputeDynamismScalar(Client* client, XlaOp operand, + XlaBuilder* builder, + ShapeIndex index = {}) { + TF_ASSIGN_OR_RETURN(auto literal, ComputeDynamismLiteral(client, operand, + builder, nullptr)); + return literal.Get({}, index); + } + + se::Platform* platform_; +}; + +TEST_F(DynamismInferenceTest, ScalarInt32Literal) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto computation = ConstantR0(&b, 42); + + auto value = ComputeDynamismScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + // A constant is 
not dynamic. + EXPECT_EQ(value.ValueOrDie(), false); + } +} + +TEST_F(DynamismInferenceTest, TupleSimple) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto tuple = Tuple(&b, {c, p}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple, &b, {1}).ValueOrDie(), true); + } +} + +TEST_F(DynamismInferenceTest, TupleGteKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto tuple = Tuple(&b, {c, p}); + auto gte0 = GetTupleElement(tuple, 0); + auto gte1 = GetTupleElement(tuple, 1); + auto tuple_2 = Tuple(&b, {gte0, gte1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, PredValueUsedTwice) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + auto pred = Eq(c, p); + auto result = Select(pred, p, c); + EXPECT_EQ(ComputeDynamismScalar(client, result, &b, {}).ValueOrDie(), + false); + } +} + +TEST_F(DynamismInferenceTest, ConcatSliceReshapeKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto concat = ConcatScalars(&b, {c, p}); + auto slice0 = SliceInDim(concat, 0, 1, 1, 0); + auto reshape0 = Reshape(slice0, {}); + auto slice1 = SliceInDim(concat, 1, 2, 1, 0); + auto reshape1 = Reshape(slice1, {}); + auto tuple_2 = Tuple(&b, {reshape0, reshape1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, ParameterIsDynamic) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto computation = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto value = ComputeDynamismScalar(client, computation, &b); + ASSERT_TRUE(value.ok()) << value.status(); + // A parameter is considered dynamic. 
+ EXPECT_EQ(value.ValueOrDie(), true); + } +} + +TEST_F(DynamismInferenceTest, UnaryOpKeepsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto neg0 = Neg(c); + auto neg1 = Neg(p); + auto tuple_2 = Tuple(&b, {neg0, neg1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, BinaryOpsOrsDynamism) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + // Static value + static value = static + auto add1 = Add(c, c); + // Dynamic value + dynamic value = dynamic + auto add2 = Add(p, c); + auto tuple_2 = Tuple(&b, {add1, add2}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + true); + } +} + +TEST_F(DynamismInferenceTest, GetDimensionSize) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + // param = Param([<=2, 3]) + // get_dimension_size(param, 0) is dynamic + // get_dimension_size(param, 1) is static + auto p = Parameter(&b, 0, ShapeUtil::MakeShape(S32, {2, 3}, {true, false}), + "p0"); + + auto gds0 = GetDimensionSize(p, 0); + auto gds1 = GetDimensionSize(p, 1); + auto tuple_2 = Tuple(&b, {gds0, gds1}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {0}).ValueOrDie(), + true); + EXPECT_EQ(ComputeDynamismScalar(client, tuple_2, &b, {1}).ValueOrDie(), + false); + } +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc index 09c91d4be14..dca8e31e792 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc @@ -123,8 +123,16 @@ BINARY_TEST_16BIT(Min, { }) // TODO(bixia): Pow fails with bfloat16 on CPU. -BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), - { Run(AddEmptyBroadcastDimension(Pow), std::pow); }) +BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), { + // See b/162664705. + known_incorrect_fn_ = [](int64 val) { + Eigen::bfloat16 f; + uint16_t val_16 = val; + memcpy(&f, &val_16, 2); + return std::isnan(f); + }; + Run(AddEmptyBroadcastDimension(Pow), std::pow); +}) // TODO(bixia): Atan2 fails with bfloat16 on CPU. BINARY_TEST_16BIT(DISABLED_ON_CPU(Atan2), diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc index 14d3b343b6c..c6feedf9e7f 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc @@ -114,6 +114,10 @@ BINARY_TEST_FLOAT_32(Min, { // // TODO(bixia): Need to investigate the failure on CPU and file bugs. BINARY_TEST_FLOAT_32(DISABLED_ON_CPU(AbsComplex), { + // TODO(timshen): see b/162664705. 
+ known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; auto host_abs_complex = [](float x, float y) { return std::abs(std::complex(x, y)); }; @@ -198,6 +202,10 @@ BINARY_TEST_FLOAT_64(Min, { // TODO(bixia): Need to investigate the failure on CPU and file bugs. BINARY_TEST_FLOAT_64(DISABLED_ON_CPU(AbsComplex), { + // TODO(timshen): see b/162664705. + known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; auto host_abs_complex = [](double x, double y) { return std::abs(std::complex(x, y)); }; diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc index b361bf94a6d..6a638d2106f 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc @@ -97,6 +97,10 @@ using ExhaustiveC128UnaryTest = ExhaustiveComplexUnaryTestBase; // TODO(b/138578594): Enable the test for the CPU backend after fixing the bug. UNARY_TEST_COMPLEX_64(DISABLED_ON_CPU(Log), { + // TODO(timshen): see b/162664705. + known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; Run(Log, [](complex64 x) { return std::log(x); }); }) diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 0fd5f191db0..0f8a4c1e273 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -711,6 +711,24 @@ ENTRY main { RunTest(hlo_text, &operand, &start_indices); } +XLA_TEST_F(GatherOperationTest, GatherFromScalarNonZeroIndices) { + const string hlo_text = R"( +HloModule GatherFromScalar + +ENTRY main { + operand = f32[1,1,1] parameter(0) + indices = s32[2,3,50] parameter(1) + ROOT gather = f32[1,2,50] gather(operand, indices), + offset_dims={0}, + collapsed_slice_dims={0,1}, + start_index_map={1,0,2}, + index_vector_dim=1, + slice_sizes={1,1,1} +} +)"; + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0, 0})); +} + class GatherClientLibraryTest : public ClientLibraryTestBase {}; // Disabled on interpreter since ExecuteAsyncOnStream is not supported. 
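For reviewers new to the dynamism-inference API exercised by the new dynamism_inference_test.cc above, the following is a minimal sketch of the query pattern that the test's ComputeDynamismScalar helper wraps. IsScalarDynamic is a hypothetical wrapper written only for illustration (it is not part of this change), the includes mirror those of the test file, and it assumes a Client* obtained as in the test's ClientOrDie helper.

#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"

// Hypothetical helper (illustration only): returns true if the scalar value
// produced by `op` can depend on a parameter, i.e. is dynamic.
xla::StatusOr<bool> IsScalarDynamic(xla::Client* client,
                                    xla::XlaBuilder* builder, xla::XlaOp op) {
  // Build a parallel PRED-valued graph in which each element reports whether
  // the corresponding element of `op` is dynamic.
  TF_ASSIGN_OR_RETURN(xla::XlaComputation subgraph,
                      builder->BuildDynamicInferenceGraph(op));
  // Constant-fold that graph on the client; the tests above run this against
  // both the local and the compile-only client.
  TF_ASSIGN_OR_RETURN(xla::Literal dynamism,
                      client->ComputeConstant(subgraph));
  // A scalar op yields a single PRED element.
  return dynamism.Get<bool>({});
}

Applied to a ConstantR0 this yields false, and applied to a Parameter it yields true, which is what the ScalarInt32Literal and ParameterIsDynamic tests above assert.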
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index d0b6e5f80ed..663e7d81006 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -230,6 +230,19 @@ StatusOr> HloTestBase::ExecuteReplicated( device_assignment); } +StatusOr> HloTestBase::ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + int64 num_replicas, bool run_hlo_passes) { + HloRunner::ReplicatedExecuteOptions options; + options.num_replicas = num_replicas; + options.run_hlo_passes = run_hlo_passes; + options.use_threads = true; + return test_runner_.ExecuteReplicated( + executable_provider, argument_count_provider, argument_provider, options); +} + StatusOr> HloTestBase::MakeReferenceModule( const HloModule& test_module, const std::function& reference_preprocessor) { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 17c2a55ba5b..fc680e39682 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -169,6 +169,13 @@ class HloTestBase : public ManifestCheckingTest { int64 num_replicas, DeviceAssignment* device_assignment, bool run_hlo_passes, bool use_threads); + // Same as above, but allows passing different programs for replicas. + StatusOr> ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + int64 num_replicas, bool run_hlo_passes); + // Executes the given hlo module on two backends and compares results. // // 'arguments': the input of the hlo module. diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index b209669715e..7e5b699d5e2 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -365,8 +365,9 @@ XLA_TEST_P(ReduceWindowTest, R4UnitWindow) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({0, 3, 2, 1})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); Padding padding = Padding::kSame; ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding); @@ -423,8 +424,9 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 1; int stride = 8; @@ -444,8 +446,9 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 3; int stride = 1; @@ -465,8 +468,9 @@ 
XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 8; int stride = 5; @@ -631,8 +635,9 @@ class R4ReduceWindowTest : public ReduceWindowTestBase, Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; - auto input_arg = CreateParameterAndTransferLiteral(0, input_literal, "p0", - &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_arg, + CreateParameterAndTransferLiteral( + 0, input_literal, "p0", &b, ¶meter)); std::vector> padding(4); for (int i = 0; i < 4; ++i) { @@ -1243,7 +1248,9 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; - CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, ¶meter); + TF_ASSERT_OK(CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, + ¶meter) + .status()); std::vector> padding(2); for (int i = 0; i < 2; ++i) { @@ -1443,8 +1450,9 @@ XLA_TEST_P(R1ReduceWindowTest, DoIt) { Literal input_literal = LiteralUtil::CreateR1(absl::Span(input_vector)); XlaOp parameter; - auto input_arg = - CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input_arg, CreateParameterAndTransferLiteral(0, input_literal, "p0", + &b, ¶meter)); std::vector> padding(1); padding[0] = {param.pad_low[0], param.pad_high[0]}; diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index 298136002e9..890156cc650 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -57,8 +57,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) { input_array.Fill(1.0f); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -70,8 +71,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -83,8 +85,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -99,8 +102,9 @@ 
XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) { input_array.Fill(1.0f); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); auto reshape = Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{}); auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie(); @@ -115,8 +119,9 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) { Literal param0_literal = LiteralUtil::CreateR0(1.0f); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, param0_literal, "param0", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, param0_literal, "param0", + &builder, ¶meter)); auto a = Neg(parameter); Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1}); @@ -130,8 +135,9 @@ XLA_TEST_P(ReshapeTest, Trivial0x3) { Array2D input_array(0, 3); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -144,8 +150,9 @@ XLA_TEST_P(ReshapeTest, Trivial0x3WithParameter) { Literal param0_literal = LiteralUtil::CreateR2FromArray2D(Array2D(0, 3)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, param0_literal, "param0", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, param0_literal, "param0", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -157,8 +164,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x0) { Array2D input_array(3, 0); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -170,8 +178,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR2({{1.0f, 2.0f, 3.0f}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -183,8 +192,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x1) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR2({{1.0f}, {2.0f}, {3.0f}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); 
+ TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -196,8 +206,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_0_To_2x0) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR1({}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0}, /*new_sizes=*/{2, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}}); @@ -211,8 +222,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) { auto input_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0}, /*new_sizes=*/{2, 3}); auto expected_literal = @@ -226,8 +238,9 @@ XLA_TEST_P(ReshapeTest, Reshape0x2To2x0) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 2)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}}); @@ -241,8 +254,9 @@ XLA_TEST_P(ReshapeTest, ReshapeRowToCol) { auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3); auto input_literal = LiteralUtil::CreateFromArray(*simple); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 1}); @@ -258,8 +272,9 @@ XLA_TEST_P(ReshapeTest, TransposeAsReshape) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 4}); @@ -274,8 +289,9 @@ XLA_TEST_P(ReshapeTest, Transpose0x4) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 4)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Transpose(parameter, {1, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}, {}, {}}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -288,8 +304,9 @@ XLA_TEST_P(ReshapeTest, Transpose4x3) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); 
XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Transpose(parameter, {1, 0}); auto expected = ReferenceUtil::TransposeArray2D(*a4x3); @@ -304,8 +321,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffleZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(6, 0)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 3, 0, 0}); auto expected_literal = @@ -318,8 +336,9 @@ XLA_TEST_P(ReshapeTest, ReshapeR4ToR2ZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array4D(2, 3, 4, 0)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{24, 0}); auto expected_literal = LiteralUtil::CreateFromArray(Array2D(24, 0)); @@ -334,8 +353,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 6}); @@ -349,8 +369,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffleZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 6)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 0}); auto expected_literal = LiteralUtil::CreateFromArray(Array2D(3, 0)); @@ -365,8 +386,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{2, 6}); Array2D expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f}, @@ -391,8 +413,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, /*new_sizes=*/{24}); auto expected_literal = 
LiteralUtil::CreateR1( @@ -406,8 +429,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, /*new_sizes=*/{8, 3}); auto expected_literal = LiteralUtil::CreateR2({{10, 11, 12}, @@ -426,8 +450,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{24}); auto expected_literal = LiteralUtil::CreateR1( @@ -441,8 +466,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{8, 3}); auto expected_literal = LiteralUtil::CreateR2({{10, 20, 30}, @@ -461,8 +487,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{2, 6, 2}); auto expected_literal = LiteralUtil::CreateR3( @@ -494,8 +521,9 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) { t2x2x2x3.FillWithYX(*filler2x3); auto input_literal = LiteralUtil::CreateFromArray(t2x2x2x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3}); auto expected_literal = LiteralUtil::CreateR2( {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, @@ -519,8 +547,9 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) { t(1, 0, 1, 1) = 7; auto input_literal = LiteralUtil::CreateFromArray(t); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 4}); @@ -542,8 +571,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) { input_literal.Set(zeros, 83.0f); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, 
CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, dimensions, {}); auto expected_literal = LiteralUtil::CreateR0(83.0f); @@ -556,8 +586,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) { XlaBuilder b(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", &b, - ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, {}, {}); EXPECT_THAT( ExecuteToString(&b, {}), @@ -568,8 +599,9 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) { XlaBuilder b(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f, 2.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", &b, - ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, {1}, {}); EXPECT_THAT(ExecuteToString(&b, {}), ::testing::HasSubstr("mismatched element counts")); @@ -604,8 +636,9 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { LayoutUtil::MakeLayout({0, 1, 2, 3})); // clang-format on XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8}); @@ -639,8 +672,9 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { {200, 201, 202, 203, 204, 205, 206, 207}, }); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off @@ -666,8 +700,9 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) { {200, 201, 202, 203, 204, 205, 206, 207}, }); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off @@ -694,8 +729,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1}); Literal expected = LiteralUtil::ReshapeSlice({2, 1}, {1, 0}, input_literal); @@ -713,8 +749,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2}); Literal expected = 
LiteralUtil::ReshapeSlice({4, 2}, {1, 0}, input_literal); @@ -733,8 +770,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 2, 1, 3}, /*new_sizes=*/{5, 60}); @@ -759,8 +797,9 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({1, 2, 3, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{3, 0, 1, 2}, /*new_sizes=*/{7, 2, 3, 5}); XlaComputation computation = builder.Build().ConsumeValueOrDie(); @@ -793,8 +832,9 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) { {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{1, 2, 3, 4}); @@ -808,8 +848,9 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) { XlaBuilder builder(TestName()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 3, 2, 0}, /*new_sizes=*/{2, 4, 3, 1}); @@ -840,8 +881,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -867,8 +909,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -894,8 +937,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -922,8 +966,9 @@ XLA_TEST_P(ReshapeTest, 
R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -949,8 +994,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) { input, LayoutUtil::MakeLayout({0, 1, 2, 3})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds); diff --git a/tensorflow/compiler/jit/union_find.h b/tensorflow/compiler/xla/union_find.h similarity index 100% rename from tensorflow/compiler/jit/union_find.h rename to tensorflow/compiler/xla/union_find.h diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index 1fbce96625b..4034e5fdd27 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -31,10 +31,10 @@ limitations under the License. #include "absl/strings/str_split.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/numbers.h" diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index e8b6105d3fe..d334f879c3e 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -627,6 +627,11 @@ message OpSharding { // applied, this is inferred from the instruction this sharding gets attached // to. repeated OpSharding tuple_shardings = 5; + + // Only used for OTHER type. If true, data is sharded according to other + // dimensions of tile_assignment(), but replicated across devices along the + // last dimension. (Experimental) + bool replicate_on_last_tile_dim = 6; } // Describes the replica groups in a cross replica op (e.g., all-reduce and diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 6a704be4adb..172a970d207 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -96,6 +96,7 @@ tf_gen_op_libs( "xrt_execute_op", ], deps = [ + "//tensorflow/compiler/jit:flags", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc index a4be39b96c6..321d7409103 100644 --- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -20,6 +21,11 @@ limitations under the License. namespace tensorflow { +static bool Initialized = [] { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + return true; +}(); + REGISTER_OP("XRTAllocate") .Input("allocation: string") .Output("handle: int64") diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 161a0a95856..6da5c43ce82 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -72,7 +72,6 @@ load( "if_ios", "if_mobile", "if_not_windows", - "if_tpu", "tf_android_core_proto_headers", "tf_cc_test", "tf_cc_test_mkl", @@ -117,6 +116,7 @@ load( "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", + "tf_tpu_dependencies", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -318,7 +318,6 @@ alias( cc_library( name = "lib_proto_parsing", hdrs = [ - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_proto_parsing_headers", "//tensorflow/core/lib/strings:legacy_lib_proto_parsing_headers", "//tensorflow/core/platform:lib_proto_parsing_hdrs", @@ -328,7 +327,6 @@ cc_library( ":platform_base", "@com_google_absl//absl/strings", "@double_conversion//:double-conversion", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/lib/core:errors", "//tensorflow/core/lib/core:stringpiece", "//tensorflow/core/lib/core:status", @@ -353,6 +351,7 @@ cc_library( cc_library( name = "lib", hdrs = [ + # TODO(rmlarsen): Remove bfloat16.h once dependency in third_party/swift is updated. "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_headers", @@ -489,6 +488,7 @@ tf_cuda_library( "//tensorflow/core/framework:register_types_traits.h", "//tensorflow/core/framework:resource_mgr.h", "//tensorflow/core/framework:resource_op_kernel.h", + "//tensorflow/core/framework:rng_alg.h", "//tensorflow/core/framework:selective_registration.h", "//tensorflow/core/framework:session_state.h", "//tensorflow/core/framework:shape_inference.h", @@ -582,7 +582,6 @@ cc_library( "//tensorflow/core/framework:numeric_types.h", "//tensorflow/core/framework:tensor_types.h", "//tensorflow/core/framework:type_traits.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/platform:framework_lite_hdrs", "//tensorflow/core/platform/default:integral_types.h", "//tensorflow/core/platform/default:logging.h", @@ -593,7 +592,6 @@ cc_library( "@nsync//:nsync_cpp", ] + [ "//third_party/eigen3", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:platform_port", "//tensorflow/core/platform:thread_annotations", @@ -629,6 +627,7 @@ tf_gen_op_libs( "io_ops", "linalg_ops", "list_ops", + "map_ops", "lookup_ops", "manip_ops", "math_ops", @@ -654,6 +653,7 @@ tf_gen_op_libs( "spectral_ops", "state_ops", "stateless_random_ops", + "stateless_random_ops_v2", "summary_ops", "training_ops", ], @@ -672,6 +672,8 @@ tf_gen_op_libs( ":lib", ":protos_all_cc", # TODO(b/162630222): remove this dependency. 
+ "//tensorflow/c/kernels:histogram_summary_op_lib", + "//tensorflow/c/kernels:merge_summary_op_lib", "//tensorflow/c/kernels:summary_op_lib", ], ) @@ -843,6 +845,7 @@ cc_library( ":io_ops_op_lib", ":linalg_ops_op_lib", ":list_ops_op_lib", + ":map_ops_op_lib", ":logging_ops_op_lib", ":lookup_ops_op_lib", ":manip_ops_op_lib", @@ -870,11 +873,14 @@ cc_library( ":spectral_ops_op_lib", ":state_ops_op_lib", ":stateless_random_ops_op_lib", + ":stateless_random_ops_v2_op_lib", ":string_ops_op_lib", ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", "//tensorflow/c/kernels:bitcast_op_lib", + "//tensorflow/c/kernels:histogram_summary_op_lib", + "//tensorflow/c/kernels:merge_summary_op_lib", "//tensorflow/c/kernels:summary_op_lib", "//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op", ] + if_chromiumos( @@ -983,8 +989,10 @@ cc_library( name = "all_kernels_impl", visibility = [":__subpackages__"], deps = [ - "//tensorflow/c/kernels:summary_op", "//tensorflow/c/kernels:bitcast_op", + "//tensorflow/c/kernels:histogram_summary_op", + "//tensorflow/c/kernels:merge_summary_op", + "//tensorflow/c/kernels:summary_op", "//tensorflow/core/kernels:array", "//tensorflow/core/kernels:audio", "//tensorflow/core/kernels:batch_kernels", @@ -1008,9 +1016,8 @@ cc_library( "//tensorflow/core/kernels:functional_ops", "//tensorflow/core/kernels:grappler", "//tensorflow/core/kernels:histogram_op", - "//tensorflow/core/kernels:image", "//tensorflow/core/kernels:io", - "//tensorflow/core/kernels/linalg:linalg", + "//tensorflow/core/kernels:isotonic_regression_op", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", "//tensorflow/core/kernels:manip", @@ -1044,32 +1051,34 @@ cc_library( "//tensorflow/core/kernels:summary_kernels", "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", + "//tensorflow/core/kernels/linalg:linalg", + "//tensorflow/core/kernels/image:image", "//tensorflow/core/kernels/sparse:kernels", ] + if_not_windows([ "//tensorflow/core/kernels/neon:neon_depthwise_conv_op", ]) + if_mkl([ - "//tensorflow/core/kernels:mkl_aggregate_ops", - "//tensorflow/core/kernels:mkl_concat_op", - "//tensorflow/core/kernels:mkl_dequantize_op", - "//tensorflow/core/kernels:mkl_conv_op", - "//tensorflow/core/kernels:mkl_cwise_ops_common", - "//tensorflow/core/kernels:mkl_fused_batch_norm_op", - "//tensorflow/core/kernels:mkl_identity_op", - "//tensorflow/core/kernels:mkl_input_conversion_op", - "//tensorflow/core/kernels:mkl_lrn_op", - "//tensorflow/core/kernels:mkl_pooling_ops", - "//tensorflow/core/kernels:mkl_qmatmul_op", - "//tensorflow/core/kernels:mkl_requantize_ops", - "//tensorflow/core/kernels:mkl_quantize_op", - "//tensorflow/core/kernels:mkl_relu_op", - "//tensorflow/core/kernels:mkl_reshape_op", - "//tensorflow/core/kernels:mkl_slice_op", - "//tensorflow/core/kernels:mkl_softmax_op", - "//tensorflow/core/kernels:mkl_transpose_op", - "//tensorflow/core/kernels:mkl_batch_matmul_op", - "//tensorflow/core/kernels:mkl_matmul_op", - "//tensorflow/core/kernels:mkl_tfconv_op", - "//tensorflow/core/kernels:mkl_tmp_bf16_ops", + "//tensorflow/core/kernels/mkl:mkl_aggregate_ops", + "//tensorflow/core/kernels/mkl:mkl_concat_op", + "//tensorflow/core/kernels/mkl:mkl_dequantize_op", + "//tensorflow/core/kernels/mkl:mkl_conv_op", + "//tensorflow/core/kernels/mkl:mkl_cwise_ops_common", + "//tensorflow/core/kernels/mkl:mkl_fused_batch_norm_op", + "//tensorflow/core/kernels/mkl:mkl_identity_op", + "//tensorflow/core/kernels/mkl:mkl_input_conversion_op", + 
"//tensorflow/core/kernels/mkl:mkl_lrn_op", + "//tensorflow/core/kernels/mkl:mkl_pooling_ops", + "//tensorflow/core/kernels/mkl:mkl_qmatmul_op", + "//tensorflow/core/kernels/mkl:mkl_requantize_ops", + "//tensorflow/core/kernels/mkl:mkl_quantize_op", + "//tensorflow/core/kernels/mkl:mkl_relu_op", + "//tensorflow/core/kernels/mkl:mkl_reshape_op", + "//tensorflow/core/kernels/mkl:mkl_slice_op", + "//tensorflow/core/kernels/mkl:mkl_softmax_op", + "//tensorflow/core/kernels/mkl:mkl_transpose_op", + "//tensorflow/core/kernels/mkl:mkl_batch_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_tfconv_op", + "//tensorflow/core/kernels/mkl:mkl_tmp_bf16_ops", ]) + if_cuda_or_rocm([ "//tensorflow/core/kernels:cudnn_rnn_kernels", ]) + if_cuda([ @@ -1080,9 +1089,7 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_op_kernels", "//tensorflow/compiler/tf2tensorrt:trt_op_kernels", - ]) + if_tpu([ - "//tensorflow/core/tpu/kernels", - ]), + ]) + tf_tpu_dependencies(), ) cc_library( @@ -1107,6 +1114,8 @@ cc_library( # these also dynamically loading. "//tensorflow/core/kernels:dataset_ops", # Depends on grappler "//tensorflow/core/kernels:list_kernels", # Depends on variant_op_registry.h + "//tensorflow/core/kernels:map_kernels", + "//tensorflow/core/kernels:tensor_map", ], ) @@ -1158,7 +1167,7 @@ cc_library( ) # Test support library needed for higher-level (TensorFlow-specific) tests -cc_library( +tf_cuda_library( name = "testlib", testonly = 1, srcs = [ @@ -1251,7 +1260,6 @@ filegroup( "//tensorflow/core/example:mobile_srcs_no_runtime", "//tensorflow/core/framework:attr_value_proto_text_srcs", "//tensorflow/core/framework:mobile_srcs_no_runtime", - "//tensorflow/core/lib/bfloat16:mobile_srcs_no_runtime", "//tensorflow/core/lib/core:mobile_srcs_no_runtime", "//tensorflow/core/lib/gtl:mobile_srcs_no_runtime", "//tensorflow/core/lib/hash:mobile_srcs_no_runtime", @@ -1290,6 +1298,7 @@ filegroup( "//tensorflow/core/graph:mobile_srcs_only_runtime", "//tensorflow/core/kernels:mobile_srcs", "//tensorflow/core/lib/io:mobile_srcs_only_runtime", + "//tensorflow/core/nccl:mobile_srcs", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/public:mobile_srcs_only_runtime", "//tensorflow/core/util/sparse:mobile_srcs_only_runtime", @@ -1689,7 +1698,6 @@ filegroup( "//tensorflow/core/framework:resource_handle.h", "//tensorflow/core/platform:legacy_lib_internal_headers", "//tensorflow/core/platform:lib_internal_private_hdrs", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_all_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", "//tensorflow/core/lib/histogram:legacy_lib_histogram_all_headers", @@ -1806,7 +1814,6 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "//third_party/eigen3", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/lib/core:arena", "//tensorflow/core/lib/core:bitmap", "//tensorflow/core/lib/core:blocking_counter", @@ -1887,6 +1894,7 @@ cc_library( "//tensorflow/core/lib/strings:strcat", "//tensorflow/core/lib/strings:stringprintf", "//tensorflow/core/platform:abi", + "//tensorflow/core/platform:bfloat16", "//tensorflow/core/platform:base64", "//tensorflow/core/platform:blocking_counter", "//tensorflow/core/platform:casts", @@ -1973,6 +1981,7 @@ cc_library( ":lib", ":lib_internal", "//tensorflow/core/platform:gif", + "@com_google_absl//absl/strings", ], ) @@ -2005,6 +2014,11 @@ alias( actual = 
"//tensorflow/core/lib/png:png_io", ) +alias( + name = "portable_png_internal", + actual = "//tensorflow/core/lib/png:png_io", +) + alias( name = "android_png_internal", actual = "//tensorflow/core/lib/png:png_io", @@ -2013,7 +2027,6 @@ alias( cc_library( name = "tflite_portable_logging", hdrs = [ - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/platform:tflite_portable_logging_hdrs", "//tensorflow/core/platform/default:integral_types.h", "//tensorflow/core/platform/default:logging.h", @@ -2034,8 +2047,8 @@ cc_library( ) cc_library( - name = "android_jpeg_internal", - srcs = if_android([ + name = "portable_jpeg_internal", + srcs = if_mobile([ "lib/jpeg/jpeg_handle.cc", "lib/jpeg/jpeg_mem.cc", "//tensorflow/core/platform:jpeg_hdrs", @@ -2043,14 +2056,13 @@ cc_library( hdrs = [ "lib/jpeg/jpeg_handle.h", "lib/jpeg/jpeg_mem.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", "//tensorflow/core/platform:jpeg_internal_hdrs", "//tensorflow/core/platform/default:integral_types.h", "//tensorflow/core/platform/default:logging.h", ], copts = tf_copts(), - linkopts = ["-ldl"], + linkopts = if_android(["-ldl"]), deps = [ ":core_stringpiece", "//tensorflow/core/platform:dynamic_annotations", @@ -2063,14 +2075,13 @@ cc_library( ) cc_library( - name = "android_gif_internal", - srcs = if_android([ + name = "portable_gif_internal", + srcs = if_mobile([ "lib/gif/gif_io.cc", "//tensorflow/core/platform:gif_hdrs", ]), hdrs = [ "lib/gif/gif_io.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", "//tensorflow/core/lib/gtl:legacy_android_gif_internal_headers", "//tensorflow/core/platform:gif_internal_hdrs", @@ -2078,21 +2089,27 @@ cc_library( "//tensorflow/core/platform/default:logging.h", ], copts = tf_copts(), - linkopts = ["-ldl"], + linkopts = if_android(["-ldl"]), deps = [ - "//tensorflow/core/lib/strings:numbers", - "//tensorflow/core/lib/strings:strcat", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:gif", "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:numbers", - "//tensorflow/core/platform:strcat", "//tensorflow/core/platform:stringpiece", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/strings", ], ) +alias( + name = "android_jpeg_internal", + actual = ":portable_jpeg_internal", +) + +alias( + name = "android_gif_internal", + actual = ":portable_gif_internal", +) + alias( name = "error_codes_proto_impl", actual = "//tensorflow/core/protobuf:error_codes_proto_impl", @@ -2693,27 +2710,27 @@ tf_cc_test_mkl( "//tensorflow/core/kernels:ops_util", "//third_party/eigen3", ] + if_mkl([ - "//tensorflow/core/kernels:mkl_aggregate_ops", - "//tensorflow/core/kernels:mkl_batch_matmul_op", - "//tensorflow/core/kernels:mkl_concat_op", - "//tensorflow/core/kernels:mkl_conv_op", - "//tensorflow/core/kernels:mkl_cwise_ops_common", - "//tensorflow/core/kernels:mkl_dequantize_op", - "//tensorflow/core/kernels:mkl_fused_batch_norm_op", - "//tensorflow/core/kernels:mkl_identity_op", - "//tensorflow/core/kernels:mkl_input_conversion_op", - "//tensorflow/core/kernels:mkl_lrn_op", - "//tensorflow/core/kernels:mkl_matmul_op", - "//tensorflow/core/kernels:mkl_pooling_ops", - "//tensorflow/core/kernels:mkl_qmatmul_op", - "//tensorflow/core/kernels:mkl_quantize_op", - "//tensorflow/core/kernels:mkl_relu_op", - "//tensorflow/core/kernels:mkl_reshape_op", - "//tensorflow/core/kernels:mkl_slice_op", - 
"//tensorflow/core/kernels:mkl_softmax_op", - "//tensorflow/core/kernels:mkl_tfconv_op", - "//tensorflow/core/kernels:mkl_transpose_op", - "//tensorflow/core/kernels:mkl_tmp_bf16_ops", + "//tensorflow/core/kernels/mkl:mkl_aggregate_ops", + "//tensorflow/core/kernels/mkl:mkl_batch_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_concat_op", + "//tensorflow/core/kernels/mkl:mkl_conv_op", + "//tensorflow/core/kernels/mkl:mkl_cwise_ops_common", + "//tensorflow/core/kernels/mkl:mkl_dequantize_op", + "//tensorflow/core/kernels/mkl:mkl_fused_batch_norm_op", + "//tensorflow/core/kernels/mkl:mkl_identity_op", + "//tensorflow/core/kernels/mkl:mkl_input_conversion_op", + "//tensorflow/core/kernels/mkl:mkl_lrn_op", + "//tensorflow/core/kernels/mkl:mkl_matmul_op", + "//tensorflow/core/kernels/mkl:mkl_pooling_ops", + "//tensorflow/core/kernels/mkl:mkl_qmatmul_op", + "//tensorflow/core/kernels/mkl:mkl_quantize_op", + "//tensorflow/core/kernels/mkl:mkl_relu_op", + "//tensorflow/core/kernels/mkl:mkl_reshape_op", + "//tensorflow/core/kernels/mkl:mkl_slice_op", + "//tensorflow/core/kernels/mkl:mkl_softmax_op", + "//tensorflow/core/kernels/mkl:mkl_tfconv_op", + "//tensorflow/core/kernels/mkl:mkl_transpose_op", + "//tensorflow/core/kernels/mkl:mkl_tmp_bf16_ops", ]), ) @@ -2965,6 +2982,8 @@ filegroup( srcs = [ # PNG data "//tensorflow/core/lib/png:testdata", + "//tensorflow/core/lib/ssim:testdata", + "//tensorflow/core/lib/psnr:testdata", # JPEG data "lib/jpeg/testdata/jpeg_merge_test1.jpg", "lib/jpeg/testdata/jpeg_merge_test1_cmyk.jpg", @@ -2986,44 +3005,14 @@ filegroup( # GIF data with optimization "lib/gif/testdata/optimized.gif", # BMP data - "lib/bmp/testdata/lena.bmp", - "lib/bmp/testdata/rgb_small.bmp", - "lib/bmp/testdata/rgb_small_255.bmp", - "lib/bmp/testdata/rgba_small.bmp", - "lib/bmp/testdata/rgba_small_255.bmp", - "lib/bmp/testdata/grayscale_small.bmp", - "lib/bmp/testdata/grayscale_small_3channels.bmp", - "lib/bmp/testdata/grayscale_small_4channels.bmp", - # SSIM, PSNR data - "lib/ssim/testdata/checkerboard1.png", - "lib/ssim/testdata/checkerboard2.png", - "lib/ssim/testdata/checkerboard3.png", - "lib/psnr/testdata/cat_q20.jpg", - "lib/psnr/testdata/cat_q72.jpg", - "lib/psnr/testdata/cat_q95.jpg", + "//tensorflow/core/lib/bmp:bmp_testdata", ], visibility = ["//visibility:public"], ) -filegroup( +alias( name = "lmdb_testdata", - testonly = 1, - srcs = [ - # A simple key-value store: - # 0 : 'b' - # 1 : 'b' - # ... - # 9 : 'b' - # Which is then overwritten with: - # 0 : 'a' - # 1 : 'b' - # ... - # 9 : 'j' - "lib/lmdb/testdata/data.mdb", - # LMDB, being a memory-mapped database, uses a different file format on - # big-endian systems. 
- "lib/lmdb/testdata/data_bigendian.mdb", - ], + actual = "//tensorflow/core/lib/lmdb:lmdb_testdata", visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index dfa0b78cb17..e72f74e26e4 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -37,9 +37,9 @@ filegroup( visibility = ["//tensorflow:internal"], ) -filegroup( +alias( name = "java_api_def", - srcs = glob(["java_api/*"]), + actual = "//tensorflow/core/api_def/java_api:java_api_def", visibility = ["//tensorflow:internal"], ) diff --git a/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt index 2184b644b23..dc018aec4aa 100644 --- a/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt @@ -1,4 +1,11 @@ op { graph_op_name: "Acos" summary: "Computes acos of x element-wise." + description: <